Merge branch 'sched/urgent' into sched/core
author     Ingo Molnar <mingo@kernel.org>
           Sun, 18 Nov 2012 08:34:44 +0000 (09:34 +0100)
committer  Ingo Molnar <mingo@kernel.org>
           Sun, 18 Nov 2012 08:34:44 +0000 (09:34 +0100)
Merge in fixes before we queue up dependent bits, to avoid conflicts.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
fs/proc/base.c
kernel/sched/fair.c
kernel/sysctl.c

diff --combined fs/proc/base.c
index 144a96732dd7d602df0d5f8a504e435cfdb07229,bb1d9623bad29030e687ec6657533e81f676ceac..5c1ad58c802827a90fbf8f98c36e596de5e6bda8
@@@ -90,7 -90,6 +90,7 @@@
  #endif
  #include <trace/events/oom.h>
  #include "internal.h"
 +#include "fd.h"
  
  /* NOTE:
   *    Implementing inode permission operations in /proc is almost
@@@ -137,6 -136,8 +137,6 @@@ struct pid_entry 
                NULL, &proc_single_file_operations,     \
                { .proc_show = show } )
  
 -static int proc_fd_permission(struct inode *inode, int mask);
 -
  /*
   * Count the number of hardlinks for the pid_entry table, excluding the .
   * and .. links.
@@@ -873,6 -874,111 +873,6 @@@ static const struct file_operations pro
        .release        = mem_release,
  };
  
 -static ssize_t oom_adjust_read(struct file *file, char __user *buf,
 -                              size_t count, loff_t *ppos)
 -{
 -      struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
 -      char buffer[PROC_NUMBUF];
 -      size_t len;
 -      int oom_adjust = OOM_DISABLE;
 -      unsigned long flags;
 -
 -      if (!task)
 -              return -ESRCH;
 -
 -      if (lock_task_sighand(task, &flags)) {
 -              oom_adjust = task->signal->oom_adj;
 -              unlock_task_sighand(task, &flags);
 -      }
 -
 -      put_task_struct(task);
 -
 -      len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
 -
 -      return simple_read_from_buffer(buf, count, ppos, buffer, len);
 -}
 -
 -static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 -                              size_t count, loff_t *ppos)
 -{
 -      struct task_struct *task;
 -      char buffer[PROC_NUMBUF];
 -      int oom_adjust;
 -      unsigned long flags;
 -      int err;
 -
 -      memset(buffer, 0, sizeof(buffer));
 -      if (count > sizeof(buffer) - 1)
 -              count = sizeof(buffer) - 1;
 -      if (copy_from_user(buffer, buf, count)) {
 -              err = -EFAULT;
 -              goto out;
 -      }
 -
 -      err = kstrtoint(strstrip(buffer), 0, &oom_adjust);
 -      if (err)
 -              goto out;
 -      if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
 -           oom_adjust != OOM_DISABLE) {
 -              err = -EINVAL;
 -              goto out;
 -      }
 -
 -      task = get_proc_task(file->f_path.dentry->d_inode);
 -      if (!task) {
 -              err = -ESRCH;
 -              goto out;
 -      }
 -
 -      task_lock(task);
 -      if (!task->mm) {
 -              err = -EINVAL;
 -              goto err_task_lock;
 -      }
 -
 -      if (!lock_task_sighand(task, &flags)) {
 -              err = -ESRCH;
 -              goto err_task_lock;
 -      }
 -
 -      if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) {
 -              err = -EACCES;
 -              goto err_sighand;
 -      }
 -
 -      /*
 -       * Warn that /proc/pid/oom_adj is deprecated, see
 -       * Documentation/feature-removal-schedule.txt.
 -       */
 -      printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
 -                current->comm, task_pid_nr(current), task_pid_nr(task),
 -                task_pid_nr(task));
 -      task->signal->oom_adj = oom_adjust;
 -      /*
 -       * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
 -       * value is always attainable.
 -       */
 -      if (task->signal->oom_adj == OOM_ADJUST_MAX)
 -              task->signal->oom_score_adj = OOM_SCORE_ADJ_MAX;
 -      else
 -              task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
 -                                                              -OOM_DISABLE;
 -      trace_oom_score_adj_update(task);
 -err_sighand:
 -      unlock_task_sighand(task, &flags);
 -err_task_lock:
 -      task_unlock(task);
 -      put_task_struct(task);
 -out:
 -      return err < 0 ? err : count;
 -}
 -
 -static const struct file_operations proc_oom_adjust_operations = {
 -      .read           = oom_adjust_read,
 -      .write          = oom_adjust_write,
 -      .llseek         = generic_file_llseek,
 -};
 -
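
The block removed above mapped the legacy oom_adj range onto oom_score_adj; a standalone sketch of that arithmetic, assuming the historical constants OOM_DISABLE = -17, OOM_ADJUST_MIN = -16, OOM_ADJUST_MAX = 15 and OOM_SCORE_ADJ_MAX = 1000 (illustrative only, not kernel code):

#include <stdio.h>

#define OOM_DISABLE		(-17)
#define OOM_ADJUST_MIN		(-16)
#define OOM_ADJUST_MAX		15
#define OOM_SCORE_ADJ_MAX	1000

int main(void)
{
	/* Mirror of the removed oom_adj -> oom_score_adj scaling. */
	for (int oom_adjust = OOM_ADJUST_MIN; oom_adjust <= OOM_ADJUST_MAX; oom_adjust++) {
		int score_adj;

		if (oom_adjust == OOM_ADJUST_MAX)
			score_adj = OOM_SCORE_ADJ_MAX;	/* keep the maximum attainable */
		else
			score_adj = oom_adjust * OOM_SCORE_ADJ_MAX / -OOM_DISABLE;

		printf("oom_adj %3d -> oom_score_adj %5d\n", oom_adjust, score_adj);
	}
	return 0;
}
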
  static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
                                        size_t count, loff_t *ppos)
  {
@@@ -946,7 -1052,15 +946,7 @@@ static ssize_t oom_score_adj_write(stru
        if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
                task->signal->oom_score_adj_min = oom_score_adj;
        trace_oom_score_adj_update(task);
 -      /*
 -       * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
 -       * always attainable.
 -       */
 -      if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
 -              task->signal->oom_adj = OOM_DISABLE;
 -      else
 -              task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) /
 -                                                      OOM_SCORE_ADJ_MAX;
 +
  err_sighand:
        unlock_task_sighand(task, &flags);
  err_task_lock:
@@@ -975,8 -1089,7 +975,8 @@@ static ssize_t proc_loginuid_read(struc
        if (!task)
                return -ESRCH;
        length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
 -                              audit_get_loginuid(task));
 +                         from_kuid(file->f_cred->user_ns,
 +                                   audit_get_loginuid(task)));
        put_task_struct(task);
        return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
  }
@@@ -988,7 -1101,6 +988,7 @@@ static ssize_t proc_loginuid_write(stru
        char *page, *tmp;
        ssize_t length;
        uid_t loginuid;
 +      kuid_t kloginuid;
  
        rcu_read_lock();
        if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
                goto out_free_page;
  
        }
 -      length = audit_set_loginuid(loginuid);
 +      kloginuid = make_kuid(file->f_cred->user_ns, loginuid);
 +      if (!uid_valid(kloginuid)) {
 +              length = -EINVAL;
 +              goto out_free_page;
 +      }
 +
 +      length = audit_set_loginuid(kloginuid);
        if (likely(length == 0))
                length = count;
  
@@@ -1165,81 -1271,6 +1165,6 @@@ static const struct file_operations pro
  
  #endif
  
- #ifdef CONFIG_SCHED_AUTOGROUP
- /*
-  * Print out autogroup related information:
-  */
- static int sched_autogroup_show(struct seq_file *m, void *v)
- {
-       struct inode *inode = m->private;
-       struct task_struct *p;
-       p = get_proc_task(inode);
-       if (!p)
-               return -ESRCH;
-       proc_sched_autogroup_show_task(p, m);
-       put_task_struct(p);
-       return 0;
- }
- static ssize_t
- sched_autogroup_write(struct file *file, const char __user *buf,
-           size_t count, loff_t *offset)
- {
-       struct inode *inode = file->f_path.dentry->d_inode;
-       struct task_struct *p;
-       char buffer[PROC_NUMBUF];
-       int nice;
-       int err;
-       memset(buffer, 0, sizeof(buffer));
-       if (count > sizeof(buffer) - 1)
-               count = sizeof(buffer) - 1;
-       if (copy_from_user(buffer, buf, count))
-               return -EFAULT;
-       err = kstrtoint(strstrip(buffer), 0, &nice);
-       if (err < 0)
-               return err;
-       p = get_proc_task(inode);
-       if (!p)
-               return -ESRCH;
-       err = proc_sched_autogroup_set_nice(p, nice);
-       if (err)
-               count = err;
-       put_task_struct(p);
-       return count;
- }
- static int sched_autogroup_open(struct inode *inode, struct file *filp)
- {
-       int ret;
-       ret = single_open(filp, sched_autogroup_show, NULL);
-       if (!ret) {
-               struct seq_file *m = filp->private_data;
-               m->private = inode;
-       }
-       return ret;
- }
- static const struct file_operations proc_pid_sched_autogroup_operations = {
-       .open           = sched_autogroup_open,
-       .read           = seq_read,
-       .write          = sched_autogroup_write,
-       .llseek         = seq_lseek,
-       .release        = single_release,
- };
- #endif /* CONFIG_SCHED_AUTOGROUP */
  static ssize_t comm_write(struct file *file, const char __user *buf,
                                size_t count, loff_t *offset)
  {
@@@ -1386,7 -1417,7 +1311,7 @@@ out
        return error;
  }
  
 -static const struct inode_operations proc_pid_link_inode_operations = {
 +const struct inode_operations proc_pid_link_inode_operations = {
        .readlink       = proc_pid_readlink,
        .follow_link    = proc_pid_follow_link,
        .setattr        = proc_setattr,
  
  /* building an inode */
  
 -static int task_dumpable(struct task_struct *task)
 -{
 -      int dumpable = 0;
 -      struct mm_struct *mm;
 -
 -      task_lock(task);
 -      mm = task->mm;
 -      if (mm)
 -              dumpable = get_dumpable(mm);
 -      task_unlock(task);
 -      if(dumpable == 1)
 -              return 1;
 -      return 0;
 -}
 -
  struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task)
  {
        struct inode * inode;
@@@ -1520,6 -1566,15 +1445,6 @@@ int pid_revalidate(struct dentry *dentr
        return 0;
  }
  
 -static int pid_delete_dentry(const struct dentry * dentry)
 -{
 -      /* Is the task we represent dead?
 -       * If so, then don't put the dentry on the lru list,
 -       * kill it immediately.
 -       */
 -      return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
 -}
 -
  const struct dentry_operations pid_dentry_operations =
  {
        .d_revalidate   = pid_revalidate,
@@@ -1582,6 -1637,289 +1507,6 @@@ end_instantiate
        return filldir(dirent, name, len, filp->f_pos, ino, type);
  }
  
 -static unsigned name_to_int(struct dentry *dentry)
 -{
 -      const char *name = dentry->d_name.name;
 -      int len = dentry->d_name.len;
 -      unsigned n = 0;
 -
 -      if (len > 1 && *name == '0')
 -              goto out;
 -      while (len-- > 0) {
 -              unsigned c = *name++ - '0';
 -              if (c > 9)
 -                      goto out;
 -              if (n >= (~0U-9)/10)
 -                      goto out;
 -              n *= 10;
 -              n += c;
 -      }
 -      return n;
 -out:
 -      return ~0U;
 -}
 -
 -#define PROC_FDINFO_MAX 64
 -
 -static int proc_fd_info(struct inode *inode, struct path *path, char *info)
 -{
 -      struct task_struct *task = get_proc_task(inode);
 -      struct files_struct *files = NULL;
 -      struct file *file;
 -      int fd = proc_fd(inode);
 -
 -      if (task) {
 -              files = get_files_struct(task);
 -              put_task_struct(task);
 -      }
 -      if (files) {
 -              /*
 -               * We are not taking a ref to the file structure, so we must
 -               * hold ->file_lock.
 -               */
 -              spin_lock(&files->file_lock);
 -              file = fcheck_files(files, fd);
 -              if (file) {
 -                      unsigned int f_flags;
 -                      struct fdtable *fdt;
 -
 -                      fdt = files_fdtable(files);
 -                      f_flags = file->f_flags & ~O_CLOEXEC;
 -                      if (close_on_exec(fd, fdt))
 -                              f_flags |= O_CLOEXEC;
 -
 -                      if (path) {
 -                              *path = file->f_path;
 -                              path_get(&file->f_path);
 -                      }
 -                      if (info)
 -                              snprintf(info, PROC_FDINFO_MAX,
 -                                       "pos:\t%lli\n"
 -                                       "flags:\t0%o\n",
 -                                       (long long) file->f_pos,
 -                                       f_flags);
 -                      spin_unlock(&files->file_lock);
 -                      put_files_struct(files);
 -                      return 0;
 -              }
 -              spin_unlock(&files->file_lock);
 -              put_files_struct(files);
 -      }
 -      return -ENOENT;
 -}
 -
 -static int proc_fd_link(struct dentry *dentry, struct path *path)
 -{
 -      return proc_fd_info(dentry->d_inode, path, NULL);
 -}
 -
 -static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
 -{
 -      struct inode *inode;
 -      struct task_struct *task;
 -      int fd;
 -      struct files_struct *files;
 -      const struct cred *cred;
 -
 -      if (flags & LOOKUP_RCU)
 -              return -ECHILD;
 -
 -      inode = dentry->d_inode;
 -      task = get_proc_task(inode);
 -      fd = proc_fd(inode);
 -
 -      if (task) {
 -              files = get_files_struct(task);
 -              if (files) {
 -                      struct file *file;
 -                      rcu_read_lock();
 -                      file = fcheck_files(files, fd);
 -                      if (file) {
 -                              unsigned f_mode = file->f_mode;
 -
 -                              rcu_read_unlock();
 -                              put_files_struct(files);
 -
 -                              if (task_dumpable(task)) {
 -                                      rcu_read_lock();
 -                                      cred = __task_cred(task);
 -                                      inode->i_uid = cred->euid;
 -                                      inode->i_gid = cred->egid;
 -                                      rcu_read_unlock();
 -                              } else {
 -                                      inode->i_uid = GLOBAL_ROOT_UID;
 -                                      inode->i_gid = GLOBAL_ROOT_GID;
 -                              }
 -
 -                              if (S_ISLNK(inode->i_mode)) {
 -                                      unsigned i_mode = S_IFLNK;
 -                                      if (f_mode & FMODE_READ)
 -                                              i_mode |= S_IRUSR | S_IXUSR;
 -                                      if (f_mode & FMODE_WRITE)
 -                                              i_mode |= S_IWUSR | S_IXUSR;
 -                                      inode->i_mode = i_mode;
 -                              }
 -
 -                              security_task_to_inode(task, inode);
 -                              put_task_struct(task);
 -                              return 1;
 -                      }
 -                      rcu_read_unlock();
 -                      put_files_struct(files);
 -              }
 -              put_task_struct(task);
 -      }
 -      d_drop(dentry);
 -      return 0;
 -}
 -
 -static const struct dentry_operations tid_fd_dentry_operations =
 -{
 -      .d_revalidate   = tid_fd_revalidate,
 -      .d_delete       = pid_delete_dentry,
 -};
 -
 -static struct dentry *proc_fd_instantiate(struct inode *dir,
 -      struct dentry *dentry, struct task_struct *task, const void *ptr)
 -{
 -      unsigned fd = (unsigned long)ptr;
 -      struct inode *inode;
 -      struct proc_inode *ei;
 -      struct dentry *error = ERR_PTR(-ENOENT);
 -
 -      inode = proc_pid_make_inode(dir->i_sb, task);
 -      if (!inode)
 -              goto out;
 -      ei = PROC_I(inode);
 -      ei->fd = fd;
 -
 -      inode->i_mode = S_IFLNK;
 -      inode->i_op = &proc_pid_link_inode_operations;
 -      inode->i_size = 64;
 -      ei->op.proc_get_link = proc_fd_link;
 -      d_set_d_op(dentry, &tid_fd_dentry_operations);
 -      d_add(dentry, inode);
 -      /* Close the race of the process dying before we return the dentry */
 -      if (tid_fd_revalidate(dentry, 0))
 -              error = NULL;
 -
 - out:
 -      return error;
 -}
 -
 -static struct dentry *proc_lookupfd_common(struct inode *dir,
 -                                         struct dentry *dentry,
 -                                         instantiate_t instantiate)
 -{
 -      struct task_struct *task = get_proc_task(dir);
 -      unsigned fd = name_to_int(dentry);
 -      struct dentry *result = ERR_PTR(-ENOENT);
 -
 -      if (!task)
 -              goto out_no_task;
 -      if (fd == ~0U)
 -              goto out;
 -
 -      result = instantiate(dir, dentry, task, (void *)(unsigned long)fd);
 -out:
 -      put_task_struct(task);
 -out_no_task:
 -      return result;
 -}
 -
 -static int proc_readfd_common(struct file * filp, void * dirent,
 -                            filldir_t filldir, instantiate_t instantiate)
 -{
 -      struct dentry *dentry = filp->f_path.dentry;
 -      struct inode *inode = dentry->d_inode;
 -      struct task_struct *p = get_proc_task(inode);
 -      unsigned int fd, ino;
 -      int retval;
 -      struct files_struct * files;
 -
 -      retval = -ENOENT;
 -      if (!p)
 -              goto out_no_task;
 -      retval = 0;
 -
 -      fd = filp->f_pos;
 -      switch (fd) {
 -              case 0:
 -                      if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
 -                              goto out;
 -                      filp->f_pos++;
 -              case 1:
 -                      ino = parent_ino(dentry);
 -                      if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
 -                              goto out;
 -                      filp->f_pos++;
 -              default:
 -                      files = get_files_struct(p);
 -                      if (!files)
 -                              goto out;
 -                      rcu_read_lock();
 -                      for (fd = filp->f_pos-2;
 -                           fd < files_fdtable(files)->max_fds;
 -                           fd++, filp->f_pos++) {
 -                              char name[PROC_NUMBUF];
 -                              int len;
 -                              int rv;
 -
 -                              if (!fcheck_files(files, fd))
 -                                      continue;
 -                              rcu_read_unlock();
 -
 -                              len = snprintf(name, sizeof(name), "%d", fd);
 -                              rv = proc_fill_cache(filp, dirent, filldir,
 -                                                   name, len, instantiate, p,
 -                                                   (void *)(unsigned long)fd);
 -                              if (rv < 0)
 -                                      goto out_fd_loop;
 -                              rcu_read_lock();
 -                      }
 -                      rcu_read_unlock();
 -out_fd_loop:
 -                      put_files_struct(files);
 -      }
 -out:
 -      put_task_struct(p);
 -out_no_task:
 -      return retval;
 -}
 -
 -static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry,
 -                                  unsigned int flags)
 -{
 -      return proc_lookupfd_common(dir, dentry, proc_fd_instantiate);
 -}
 -
 -static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir)
 -{
 -      return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate);
 -}
 -
 -static ssize_t proc_fdinfo_read(struct file *file, char __user *buf,
 -                                    size_t len, loff_t *ppos)
 -{
 -      char tmp[PROC_FDINFO_MAX];
 -      int err = proc_fd_info(file->f_path.dentry->d_inode, NULL, tmp);
 -      if (!err)
 -              err = simple_read_from_buffer(buf, len, ppos, tmp, strlen(tmp));
 -      return err;
 -}
 -
 -static const struct file_operations proc_fdinfo_file_operations = {
 -      .open           = nonseekable_open,
 -      .read           = proc_fdinfo_read,
 -      .llseek         = no_llseek,
 -};
 -
 -static const struct file_operations proc_fd_operations = {
 -      .read           = generic_read_dir,
 -      .readdir        = proc_readfd,
 -      .llseek         = default_llseek,
 -};
 -
  #ifdef CONFIG_CHECKPOINT_RESTORE
  
  /*
@@@ -1700,7 -2038,7 +1625,7 @@@ out
  }
  
  struct map_files_info {
 -      struct file     *file;
 +      fmode_t         mode;
        unsigned long   len;
        unsigned char   name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
  };
@@@ -1709,10 -2047,13 +1634,10 @@@ static struct dentry 
  proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
                           struct task_struct *task, const void *ptr)
  {
 -      const struct file *file = ptr;
 +      fmode_t mode = (fmode_t)(unsigned long)ptr;
        struct proc_inode *ei;
        struct inode *inode;
  
 -      if (!file)
 -              return ERR_PTR(-ENOENT);
 -
        inode = proc_pid_make_inode(dir->i_sb, task);
        if (!inode)
                return ERR_PTR(-ENOENT);
        inode->i_size = 64;
        inode->i_mode = S_IFLNK;
  
 -      if (file->f_mode & FMODE_READ)
 +      if (mode & FMODE_READ)
                inode->i_mode |= S_IRUSR;
 -      if (file->f_mode & FMODE_WRITE)
 +      if (mode & FMODE_WRITE)
                inode->i_mode |= S_IWUSR;
  
        d_set_d_op(dentry, &tid_map_files_dentry_operations);
@@@ -1770,8 -2111,7 +1695,8 @@@ static struct dentry *proc_map_files_lo
        if (!vma)
                goto out_no_vma;
  
 -      result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file);
 +      result = proc_map_files_instantiate(dir, dentry, task,
 +                      (void *)(unsigned long)vma->vm_file->f_mode);
  
  out_no_vma:
        up_read(&mm->mmap_sem);
@@@ -1872,7 -2212,8 +1797,7 @@@ proc_map_files_readdir(struct file *fil
                                if (++pos <= filp->f_pos)
                                        continue;
  
 -                              get_file(vma->vm_file);
 -                              info.file = vma->vm_file;
 +                              info.mode = vma->vm_file->f_mode;
                                info.len = snprintf(info.name,
                                                sizeof(info.name), "%lx-%lx",
                                                vma->vm_start, vma->vm_end);
                        ret = proc_fill_cache(filp, dirent, filldir,
                                              p->name, p->len,
                                              proc_map_files_instantiate,
 -                                            task, p->file);
 +                                            task,
 +                                            (void *)(unsigned long)p->mode);
                        if (ret)
                                break;
                        filp->f_pos++;
 -                      fput(p->file);
 -              }
 -              for (; i < nr_files; i++) {
 -                      /*
 -                       * In case of error don't forget
 -                       * to put rest of file refs.
 -                       */
 -                      p = flex_array_get(fa, i);
 -                      fput(p->file);
                }
                if (fa)
                        flex_array_free(fa);
@@@ -1913,6 -2262,82 +1838,6 @@@ static const struct file_operations pro
  
  #endif /* CONFIG_CHECKPOINT_RESTORE */
  
 -/*
 - * /proc/pid/fd needs a special permission handler so that a process can still
 - * access /proc/self/fd after it has executed a setuid().
 - */
 -static int proc_fd_permission(struct inode *inode, int mask)
 -{
 -      int rv = generic_permission(inode, mask);
 -      if (rv == 0)
 -              return 0;
 -      if (task_pid(current) == proc_pid(inode))
 -              rv = 0;
 -      return rv;
 -}
 -
 -/*
 - * proc directories can do almost nothing..
 - */
 -static const struct inode_operations proc_fd_inode_operations = {
 -      .lookup         = proc_lookupfd,
 -      .permission     = proc_fd_permission,
 -      .setattr        = proc_setattr,
 -};
 -
 -static struct dentry *proc_fdinfo_instantiate(struct inode *dir,
 -      struct dentry *dentry, struct task_struct *task, const void *ptr)
 -{
 -      unsigned fd = (unsigned long)ptr;
 -      struct inode *inode;
 -      struct proc_inode *ei;
 -      struct dentry *error = ERR_PTR(-ENOENT);
 -
 -      inode = proc_pid_make_inode(dir->i_sb, task);
 -      if (!inode)
 -              goto out;
 -      ei = PROC_I(inode);
 -      ei->fd = fd;
 -      inode->i_mode = S_IFREG | S_IRUSR;
 -      inode->i_fop = &proc_fdinfo_file_operations;
 -      d_set_d_op(dentry, &tid_fd_dentry_operations);
 -      d_add(dentry, inode);
 -      /* Close the race of the process dying before we return the dentry */
 -      if (tid_fd_revalidate(dentry, 0))
 -              error = NULL;
 -
 - out:
 -      return error;
 -}
 -
 -static struct dentry *proc_lookupfdinfo(struct inode *dir,
 -                                      struct dentry *dentry,
 -                                      unsigned int flags)
 -{
 -      return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate);
 -}
 -
 -static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir)
 -{
 -      return proc_readfd_common(filp, dirent, filldir,
 -                                proc_fdinfo_instantiate);
 -}
 -
 -static const struct file_operations proc_fdinfo_operations = {
 -      .read           = generic_read_dir,
 -      .readdir        = proc_readfdinfo,
 -      .llseek         = default_llseek,
 -};
 -
 -/*
 - * proc directories can do almost nothing..
 - */
 -static const struct inode_operations proc_fdinfo_inode_operations = {
 -      .lookup         = proc_lookupfdinfo,
 -      .setattr        = proc_setattr,
 -};
 -
 -
  static struct dentry *proc_pident_instantiate(struct inode *dir,
        struct dentry *dentry, struct task_struct *task, const void *ptr)
  {
@@@ -2258,8 -2683,7 +2183,8 @@@ static void *proc_self_follow_link(stru
        pid_t tgid = task_tgid_nr_ns(current, ns);
        char *name = ERR_PTR(-ENOENT);
        if (tgid) {
 -              name = __getname();
 +              /* 11 for max length of signed int in decimal + NULL term */
 +              name = kmalloc(12, GFP_KERNEL);
                if (!name)
                        name = ERR_PTR(-ENOMEM);
                else
@@@ -2274,7 -2698,7 +2199,7 @@@ static void proc_self_put_link(struct d
  {
        char *s = nd_get_link(nd);
        if (!IS_ERR(s))
 -              __putname(s);
 +              kfree(s);
  }
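
The kmalloc(12) above follows the "11 for max length of signed int in decimal + NULL term" comment in proc_self_follow_link(); a minimal userspace check of that bound (a sketch, not kernel code):

#include <limits.h>
#include <stdio.h>

int main(void)
{
	char buf[12];	/* "-2147483648" is 11 characters plus the terminating NUL */
	int n = snprintf(buf, sizeof(buf), "%d", INT_MIN);

	printf("%s (%d chars)\n", buf, n);	/* prints 11 */
	return 0;
}
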
  
  static const struct inode_operations proc_self_inode_operations = {
@@@ -2484,11 -2908,6 +2409,11 @@@ static int proc_gid_map_open(struct ino
        return proc_id_map_open(inode, file, &proc_gid_seq_operations);
  }
  
 +static int proc_projid_map_open(struct inode *inode, struct file *file)
 +{
 +      return proc_id_map_open(inode, file, &proc_projid_seq_operations);
 +}
 +
  static const struct file_operations proc_uid_map_operations = {
        .open           = proc_uid_map_open,
        .write          = proc_uid_map_write,
@@@ -2504,14 -2923,6 +2429,14 @@@ static const struct file_operations pro
        .llseek         = seq_lseek,
        .release        = proc_id_map_release,
  };
 +
 +static const struct file_operations proc_projid_map_operations = {
 +      .open           = proc_projid_map_open,
 +      .write          = proc_projid_map_write,
 +      .read           = seq_read,
 +      .llseek         = seq_lseek,
 +      .release        = proc_id_map_release,
 +};
  #endif /* CONFIG_USER_NS */
  
  static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
@@@ -2549,9 -2960,6 +2474,6 @@@ static const struct pid_entry tgid_base
        INF("limits",     S_IRUGO, proc_pid_limits),
  #ifdef CONFIG_SCHED_DEBUG
        REG("sched",      S_IRUGO|S_IWUSR, proc_pid_sched_operations),
- #endif
- #ifdef CONFIG_SCHED_AUTOGROUP
-       REG("autogroup",  S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
  #endif
        REG("comm",      S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
  #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
        REG("cgroup",  S_IRUGO, proc_cgroup_operations),
  #endif
        INF("oom_score",  S_IRUGO, proc_oom_score),
 -      REG("oom_adj",    S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
        REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
  #ifdef CONFIG_AUDITSYSCALL
        REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
  #ifdef CONFIG_USER_NS
        REG("uid_map",    S_IRUGO|S_IWUSR, proc_uid_map_operations),
        REG("gid_map",    S_IRUGO|S_IWUSR, proc_gid_map_operations),
 +      REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
  #endif
  };
  
@@@ -2964,6 -3372,7 +2886,6 @@@ static const struct pid_entry tid_base_
        REG("cgroup",  S_IRUGO, proc_cgroup_operations),
  #endif
        INF("oom_score", S_IRUGO, proc_oom_score),
 -      REG("oom_adj",   S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
        REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
  #ifdef CONFIG_AUDITSYSCALL
        REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
  #ifdef CONFIG_USER_NS
        REG("uid_map",    S_IRUGO|S_IWUSR, proc_uid_map_operations),
        REG("gid_map",    S_IRUGO|S_IWUSR, proc_gid_map_operations),
 +      REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
  #endif
  };
  
diff --combined kernel/sched/fair.c
index a319d56c760507ec8477c3e7662d11a16fec12b1,f936552b3db1a400db1f2a01c5fe47329aed8b96..59e072b2db970b80eb2eac7b435d297305640e2a
@@@ -259,9 -259,6 +259,9 @@@ static inline struct cfs_rq *group_cfs_
        return grp->my_q;
  }
  
 +static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
 +                                     int force_update);
 +
  static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
  {
        if (!cfs_rq->on_list) {
                }
  
                cfs_rq->on_list = 1;
 +              /* We should have no load, but we need to update last_decay. */
 +              update_cfs_rq_blocked_load(cfs_rq, 0);
        }
  }
  
@@@ -658,6 -653,9 +658,6 @@@ static u64 sched_vslice(struct cfs_rq *
        return calc_delta_fair(sched_slice(cfs_rq, se), se);
  }
  
 -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
 -static void update_cfs_shares(struct cfs_rq *cfs_rq);
 -
  /*
   * Update the current task's runtime statistics. Skip current tasks that
   * are not in our scheduling class.
@@@ -677,6 -675,10 +677,6 @@@ __update_curr(struct cfs_rq *cfs_rq, st
  
        curr->vruntime += delta_exec_weighted;
        update_min_vruntime(cfs_rq);
 -
 -#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
 -      cfs_rq->load_unacc_exec_time += delta_exec;
 -#endif
  }
  
  static void update_curr(struct cfs_rq *cfs_rq)
@@@ -799,7 -801,72 +799,7 @@@ account_entity_dequeue(struct cfs_rq *c
  }
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
 -/* we need this in update_cfs_load and load-balance functions below */
 -static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
  # ifdef CONFIG_SMP
 -static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
 -                                          int global_update)
 -{
 -      struct task_group *tg = cfs_rq->tg;
 -      long load_avg;
 -
 -      load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
 -      load_avg -= cfs_rq->load_contribution;
 -
 -      if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
 -              atomic_add(load_avg, &tg->load_weight);
 -              cfs_rq->load_contribution += load_avg;
 -      }
 -}
 -
 -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 -{
 -      u64 period = sysctl_sched_shares_window;
 -      u64 now, delta;
 -      unsigned long load = cfs_rq->load.weight;
 -
 -      if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
 -              return;
 -
 -      now = rq_of(cfs_rq)->clock_task;
 -      delta = now - cfs_rq->load_stamp;
 -
 -      /* truncate load history at 4 idle periods */
 -      if (cfs_rq->load_stamp > cfs_rq->load_last &&
 -          now - cfs_rq->load_last > 4 * period) {
 -              cfs_rq->load_period = 0;
 -              cfs_rq->load_avg = 0;
 -              delta = period - 1;
 -      }
 -
 -      cfs_rq->load_stamp = now;
 -      cfs_rq->load_unacc_exec_time = 0;
 -      cfs_rq->load_period += delta;
 -      if (load) {
 -              cfs_rq->load_last = now;
 -              cfs_rq->load_avg += delta * load;
 -      }
 -
 -      /* consider updating load contribution on each fold or truncate */
 -      if (global_update || cfs_rq->load_period > period
 -          || !cfs_rq->load_period)
 -              update_cfs_rq_load_contribution(cfs_rq, global_update);
 -
 -      while (cfs_rq->load_period > period) {
 -              /*
 -               * Inline assembly required to prevent the compiler
 -               * optimising this loop into a divmod call.
 -               * See __iter_div_u64_rem() for another example of this.
 -               */
 -              asm("" : "+rm" (cfs_rq->load_period));
 -              cfs_rq->load_period /= 2;
 -              cfs_rq->load_avg /= 2;
 -      }
 -
 -      if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
 -              list_del_leaf_cfs_rq(cfs_rq);
 -}
 -
  static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
  {
        long tg_weight;
         * to gain a more accurate current total weight. See
         * update_cfs_rq_load_contribution().
         */
 -      tg_weight = atomic_read(&tg->load_weight);
 -      tg_weight -= cfs_rq->load_contribution;
 +      tg_weight = atomic64_read(&tg->load_avg);
 +      tg_weight -= cfs_rq->tg_load_contrib;
        tg_weight += cfs_rq->load.weight;
  
        return tg_weight;
@@@ -834,11 -901,27 +834,11 @@@ static long calc_cfs_shares(struct cfs_
  
        return shares;
  }
 -
 -static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
 -{
 -      if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
 -              update_cfs_load(cfs_rq, 0);
 -              update_cfs_shares(cfs_rq);
 -      }
 -}
  # else /* CONFIG_SMP */
 -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 -{
 -}
 -
  static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
  {
        return tg->shares;
  }
 -
 -static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
 -{
 -}
  # endif /* CONFIG_SMP */
  static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
                            unsigned long weight)
                account_entity_enqueue(cfs_rq, se);
  }
  
 +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
 +
  static void update_cfs_shares(struct cfs_rq *cfs_rq)
  {
        struct task_group *tg;
        reweight_entity(cfs_rq_of(se), se, shares);
  }
  #else /* CONFIG_FAIR_GROUP_SCHED */
 -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 +static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
  {
  }
 +#endif /* CONFIG_FAIR_GROUP_SCHED */
  
 -static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
 +/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
 +#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
 +/*
 + * We choose a half-life close to 1 scheduling period.
 + * Note: The tables below are dependent on this value.
 + */
 +#define LOAD_AVG_PERIOD 32
 +#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
 +#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
 +
 +/* Precomputed fixed inverse multiplies for multiplication by y^n */
 +static const u32 runnable_avg_yN_inv[] = {
 +      0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
 +      0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
 +      0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
 +      0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
 +      0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
 +      0x85aac367, 0x82cd8698,
 +};
 +
 +/*
 + * Precomputed \Sum y^k { 1<=k<=n }.  These are floor(true_value) to prevent
 + * over-estimates when re-combining.
 + */
 +static const u32 runnable_avg_yN_sum[] = {
 +          0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
 +       9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
 +      17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
 +};
 +
 +/*
 + * Approximate:
 + *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
 + */
 +static __always_inline u64 decay_load(u64 val, u64 n)
  {
 +      unsigned int local_n;
 +
 +      if (!n)
 +              return val;
 +      else if (unlikely(n > LOAD_AVG_PERIOD * 63))
 +              return 0;
 +
 +      /* after bounds checking we can collapse to 32-bit */
 +      local_n = n;
 +
 +      /*
 +       * As y^PERIOD = 1/2, we can combine
 +       *    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
 +       * With a look-up table which covers y^n (n < PERIOD)
 +       *
 +       * To achieve constant time decay_load.
 +       */
 +      if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
 +              val >>= local_n / LOAD_AVG_PERIOD;
 +              local_n %= LOAD_AVG_PERIOD;
 +      }
 +
 +      val *= runnable_avg_yN_inv[local_n];
 +      /* We don't use SRR here since we always want to round down. */
 +      return val >> 32;
  }
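
A floating-point reference for the fixed-point decay implemented by decay_load() and the runnable_avg_yN_inv[] table above, assuming only y^32 = 0.5 (a standalone sketch, not kernel code):

#include <math.h>
#include <stdio.h>

int main(void)
{
	double y = pow(0.5, 1.0 / 32.0);	/* LOAD_AVG_PERIOD = 32 => 32-period half-life */
	double val = 1024.0;

	for (int n = 0; n <= 96; n += 32)
		printf("1024 * y^%-2d = %6.1f\n", n, val * pow(y, n));
	/* ~1024, ~512, ~256, ~128: every 32 periods halves the contribution,
	 * which the shift by n/LOAD_AVG_PERIOD plus the 32-bit inverse-multiply
	 * table approximates in integer arithmetic. */
	return 0;
}
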
  
 -static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
 +/*
 + * For updates fully spanning n periods, the contribution to runnable
 + * average will be: \Sum 1024*y^n
 + *
 + * We can compute this reasonably efficiently by combining:
 + *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for  n <PERIOD}
 + */
 +static u32 __compute_runnable_contrib(u64 n)
  {
 +      u32 contrib = 0;
 +
 +      if (likely(n <= LOAD_AVG_PERIOD))
 +              return runnable_avg_yN_sum[n];
 +      else if (unlikely(n >= LOAD_AVG_MAX_N))
 +              return LOAD_AVG_MAX;
 +
 +      /* Compute \Sum y^n combining precomputed values for y^i, \Sum y^j */
 +      do {
 +              contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
 +              contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
 +
 +              n -= LOAD_AVG_PERIOD;
 +      } while (n > LOAD_AVG_PERIOD);
 +
 +      contrib = decay_load(contrib, n);
 +      return contrib + runnable_avg_yN_sum[n];
  }
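
The saturation constant LOAD_AVG_MAX used above can be sanity-checked against the closed form of the series \Sum 1024*y^k; a standalone floating-point sketch (approximate, since the kernel tables are floored):

#include <math.h>
#include <stdio.h>

int main(void)
{
	double y = pow(0.5, 1.0 / 32.0);	/* y^32 = 0.5 */
	double sum = 0.0;

	for (int k = 0; k <= 345; k++)		/* current period + LOAD_AVG_MAX_N full periods */
		sum += 1024.0 * pow(y, k);

	printf("finite sum ~= %.0f, geometric bound 1024/(1-y) ~= %.0f\n",
	       sum, 1024.0 / (1.0 - y));
	/* Both land a little above LOAD_AVG_MAX (47742); the kernel value is
	 * slightly lower because runnable_avg_yN_sum[] entries are floored. */
	return 0;
}
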
 -#endif /* CONFIG_FAIR_GROUP_SCHED */
 +
 +/*
 + * We can represent the historical contribution to runnable average as the
 + * coefficients of a geometric series.  To do this we sub-divide our runnable
 + * history into segments of approximately 1ms (1024us); label the segment that
 + * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
 + *
 + * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
 + *      p0            p1           p2
 + *     (now)       (~1ms ago)  (~2ms ago)
 + *
 + * Let u_i denote the fraction of p_i that the entity was runnable.
 + *
 + * We then designate the fractions u_i as our co-efficients, yielding the
 + * following representation of historical load:
 + *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
 + *
 + * We choose y based on the width of a reasonable scheduling period, fixing:
 + *   y^32 = 0.5
 + *
 + * This means that the contribution to load ~32ms ago (u_32) will be weighted
 + * approximately half as much as the contribution to load within the last ms
 + * (u_0).
 + *
 + * When a period "rolls over" and we have new u_0`, multiplying the previous
 + * sum again by y is sufficient to update:
 + *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
 + *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
 + */
 +static __always_inline int __update_entity_runnable_avg(u64 now,
 +                                                      struct sched_avg *sa,
 +                                                      int runnable)
 +{
 +      u64 delta, periods;
 +      u32 runnable_contrib;
 +      int delta_w, decayed = 0;
 +
 +      delta = now - sa->last_runnable_update;
 +      /*
 +       * This should only happen when time goes backwards, which it
 +       * unfortunately does during sched clock init when we swap over to TSC.
 +       */
 +      if ((s64)delta < 0) {
 +              sa->last_runnable_update = now;
 +              return 0;
 +      }
 +
 +      /*
 +       * Use 1024ns as the unit of measurement since it's a reasonable
 +       * approximation of 1us and fast to compute.
 +       */
 +      delta >>= 10;
 +      if (!delta)
 +              return 0;
 +      sa->last_runnable_update = now;
 +
 +      /* delta_w is the amount already accumulated against our next period */
 +      delta_w = sa->runnable_avg_period % 1024;
 +      if (delta + delta_w >= 1024) {
 +              /* period roll-over */
 +              decayed = 1;
 +
 +              /*
 +               * Now that we know we're crossing a period boundary, figure
 +               * out how much from delta we need to complete the current
 +               * period and accrue it.
 +               */
 +              delta_w = 1024 - delta_w;
 +              if (runnable)
 +                      sa->runnable_avg_sum += delta_w;
 +              sa->runnable_avg_period += delta_w;
 +
 +              delta -= delta_w;
 +
 +              /* Figure out how many additional periods this update spans */
 +              periods = delta / 1024;
 +              delta %= 1024;
 +
 +              sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
 +                                                periods + 1);
 +              sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
 +                                                   periods + 1);
 +
 +              /* Efficiently calculate \sum (1..n_period) 1024*y^i */
 +              runnable_contrib = __compute_runnable_contrib(periods);
 +              if (runnable)
 +                      sa->runnable_avg_sum += runnable_contrib;
 +              sa->runnable_avg_period += runnable_contrib;
 +      }
 +
 +      /* Remainder of delta accrued against u_0` */
 +      if (runnable)
 +              sa->runnable_avg_sum += delta;
 +      sa->runnable_avg_period += delta;
 +
 +      return decayed;
 +}
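
A standalone sketch of the delta_w / periods / remainder split performed above when an update crosses one or more 1024us periods (example numbers only; assumes the update does cross a boundary):

#include <stdio.h>

int main(void)
{
	unsigned int period = 1024;		/* ~1ms measured in 1024ns units */
	unsigned int delta_w = 700;		/* already accumulated in the current period */
	unsigned long long delta = 3000;	/* newly elapsed time to account */

	unsigned int head = period - delta_w;			/* completes the current period */
	unsigned long long periods = (delta - head) / period;	/* full periods spanned */
	unsigned long long tail = (delta - head) % period;	/* opens the new period */

	printf("head=%u periods=%llu tail=%llu\n", head, periods, tail);
	/* As in __update_entity_runnable_avg(): the old sums (plus head) are
	 * decayed by y^(periods+1), the full periods contribute
	 * __compute_runnable_contrib(periods), and tail accrues undecayed. */
	return 0;
}
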
 +
 +/* Synchronize an entity's decay with its parenting cfs_rq.*/
 +static inline u64 __synchronize_entity_decay(struct sched_entity *se)
 +{
 +      struct cfs_rq *cfs_rq = cfs_rq_of(se);
 +      u64 decays = atomic64_read(&cfs_rq->decay_counter);
 +
 +      decays -= se->avg.decay_count;
 +      if (!decays)
 +              return 0;
 +
 +      se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
 +      se->avg.decay_count = 0;
 +
 +      return decays;
 +}
 +
 +#ifdef CONFIG_FAIR_GROUP_SCHED
 +static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
 +                                               int force_update)
 +{
 +      struct task_group *tg = cfs_rq->tg;
 +      s64 tg_contrib;
 +
 +      tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
 +      tg_contrib -= cfs_rq->tg_load_contrib;
 +
 +      if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
 +              atomic64_add(tg_contrib, &tg->load_avg);
 +              cfs_rq->tg_load_contrib += tg_contrib;
 +      }
 +}
 +
 +/*
 + * Aggregate cfs_rq runnable averages into an equivalent task_group
 + * representation for computing load contributions.
 + */
 +static inline void __update_tg_runnable_avg(struct sched_avg *sa,
 +                                                struct cfs_rq *cfs_rq)
 +{
 +      struct task_group *tg = cfs_rq->tg;
 +      long contrib;
 +
 +      /* The fraction of a cpu used by this cfs_rq */
 +      contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
 +                        sa->runnable_avg_period + 1);
 +      contrib -= cfs_rq->tg_runnable_contrib;
 +
 +      if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
 +              atomic_add(contrib, &tg->runnable_avg);
 +              cfs_rq->tg_runnable_contrib += contrib;
 +      }
 +}
 +
 +static inline void __update_group_entity_contrib(struct sched_entity *se)
 +{
 +      struct cfs_rq *cfs_rq = group_cfs_rq(se);
 +      struct task_group *tg = cfs_rq->tg;
 +      int runnable_avg;
 +
 +      u64 contrib;
 +
 +      contrib = cfs_rq->tg_load_contrib * tg->shares;
 +      se->avg.load_avg_contrib = div64_u64(contrib,
 +                                           atomic64_read(&tg->load_avg) + 1);
 +
 +      /*
 +       * For group entities we need to compute a correction term in the case
 +       * that they are consuming <1 cpu so that we would contribute the same
 +       * load as a task of equal weight.
 +       *
 +       * Explicitly co-ordinating this measurement would be expensive, but
 +       * fortunately the sum of each cpus contribution forms a usable
 +       * lower-bound on the true value.
 +       *
 +       * Consider the aggregate of 2 contributions.  Either they are disjoint
 +       * (and the sum represents the true value) or they overlap and we are
 +       * understating by the aggregate of their overlap.
 +       *
 +       * Extending this to N cpus, for a given overlap, the maximum amount we
 +       * understate is then n_i(n_i+1)/2 * w_i where n_i is the number of
 +       * cpus that overlap for this interval and w_i is the interval width.
 +       *
 +       * On a small machine, the first term is well-bounded, which bounds
 +       * the total error since w_i is a subset of the period.  Whereas on a
 +       * larger machine, while this first term can be larger, a w_i of
 +       * consequential size is guaranteed to see n_i*w_i quickly converge
 +       * to our upper bound of 1-cpu.
 +       */
 +      runnable_avg = atomic_read(&tg->runnable_avg);
 +      if (runnable_avg < NICE_0_LOAD) {
 +              se->avg.load_avg_contrib *= runnable_avg;
 +              se->avg.load_avg_contrib >>= NICE_0_SHIFT;
 +      }
 +}
 +#else
 +static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
 +                                               int force_update) {}
 +static inline void __update_tg_runnable_avg(struct sched_avg *sa,
 +                                                struct cfs_rq *cfs_rq) {}
 +static inline void __update_group_entity_contrib(struct sched_entity *se) {}
 +#endif
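
A standalone sketch of the group-entity contribution computed in __update_group_entity_contrib() above, using made-up group numbers and assuming NICE_0_LOAD = 1024 (NICE_0_SHIFT = 10):

#include <stdio.h>
#include <stdint.h>

#define NICE_0_LOAD	1024
#define NICE_0_SHIFT	10

int main(void)
{
	uint64_t tg_load_contrib = 2048;	/* this cpu's share of the group load */
	uint64_t tg_load_avg = 8192;		/* group load summed over all cpus */
	uint64_t shares = 1024;			/* configured group weight */
	int runnable_avg = 512;			/* group uses roughly half a cpu */

	uint64_t contrib = tg_load_contrib * shares / (tg_load_avg + 1);
	if (runnable_avg < NICE_0_LOAD)		/* <1 cpu correction term */
		contrib = (contrib * runnable_avg) >> NICE_0_SHIFT;

	printf("load_avg_contrib = %llu\n", (unsigned long long)contrib);
	return 0;
}
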
 +
 +static inline void __update_task_entity_contrib(struct sched_entity *se)
 +{
 +      u32 contrib;
 +
 +      /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
 +      contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
 +      contrib /= (se->avg.runnable_avg_period + 1);
 +      se->avg.load_avg_contrib = scale_load(contrib);
 +}
 +
 +/* Compute the current contribution to load_avg by se, return any delta */
 +static long __update_entity_load_avg_contrib(struct sched_entity *se)
 +{
 +      long old_contrib = se->avg.load_avg_contrib;
 +
 +      if (entity_is_task(se)) {
 +              __update_task_entity_contrib(se);
 +      } else {
 +              __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
 +              __update_group_entity_contrib(se);
 +      }
 +
 +      return se->avg.load_avg_contrib - old_contrib;
 +}
 +
 +static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
 +                                               long load_contrib)
 +{
 +      if (likely(load_contrib < cfs_rq->blocked_load_avg))
 +              cfs_rq->blocked_load_avg -= load_contrib;
 +      else
 +              cfs_rq->blocked_load_avg = 0;
 +}
 +
 +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
 +
 +/* Update a sched_entity's runnable average */
 +static inline void update_entity_load_avg(struct sched_entity *se,
 +                                        int update_cfs_rq)
 +{
 +      struct cfs_rq *cfs_rq = cfs_rq_of(se);
 +      long contrib_delta;
 +      u64 now;
 +
 +      /*
 +       * For a group entity we need to use their owned cfs_rq_clock_task() in
 +       * case they are the parent of a throttled hierarchy.
 +       */
 +      if (entity_is_task(se))
 +              now = cfs_rq_clock_task(cfs_rq);
 +      else
 +              now = cfs_rq_clock_task(group_cfs_rq(se));
 +
 +      if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
 +              return;
 +
 +      contrib_delta = __update_entity_load_avg_contrib(se);
 +
 +      if (!update_cfs_rq)
 +              return;
 +
 +      if (se->on_rq)
 +              cfs_rq->runnable_load_avg += contrib_delta;
 +      else
 +              subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
 +}
 +
 +/*
 + * Decay the load contributed by all blocked children and account this so that
 + * their contribution may be appropriately discounted when they wake up.
 + */
 +static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
 +{
 +      u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
 +      u64 decays;
 +
 +      decays = now - cfs_rq->last_decay;
 +      if (!decays && !force_update)
 +              return;
 +
 +      if (atomic64_read(&cfs_rq->removed_load)) {
 +              u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
 +              subtract_blocked_load_contrib(cfs_rq, removed_load);
 +      }
 +
 +      if (decays) {
 +              cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
 +                                                    decays);
 +              atomic64_add(decays, &cfs_rq->decay_counter);
 +              cfs_rq->last_decay = now;
 +      }
 +
 +      __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
 +      update_cfs_shares(cfs_rq);
 +}
 +
 +static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
 +{
 +      __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
 +      __update_tg_runnable_avg(&rq->avg, &rq->cfs);
 +}
 +
 +/* Add the load generated by se into cfs_rq's child load-average */
 +static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
 +                                                struct sched_entity *se,
 +                                                int wakeup)
 +{
 +      /*
 +       * We track migrations using entity decay_count <= 0; on a wake-up
 +       * migration we use a negative decay count to track the remote decays
 +       * accumulated while sleeping.
 +       */
 +      if (unlikely(se->avg.decay_count <= 0)) {
 +              se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
 +              if (se->avg.decay_count) {
 +                      /*
 +                       * In a wake-up migration we have to approximate the
 +                       * time sleeping.  This is because we can't synchronize
 +                       * clock_task between the two cpus, and it is not
 +                       * guaranteed to be read-safe.  Instead, we can
 +                       * approximate this using our carried decays, which are
 +                       * explicitly atomically readable.
 +                       */
 +                      se->avg.last_runnable_update -= (-se->avg.decay_count)
 +                                                      << 20;
 +                      update_entity_load_avg(se, 0);
 +                      /* Indicate that we're now synchronized and on-rq */
 +                      se->avg.decay_count = 0;
 +              }
 +              wakeup = 0;
 +      } else {
 +              __synchronize_entity_decay(se);
 +      }
 +
 +      /* migrated tasks did not contribute to our blocked load */
 +      if (wakeup) {
 +              subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
 +              update_entity_load_avg(se, 0);
 +      }
 +
 +      cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
 +      /* we force update consideration on load-balancer moves */
 +      update_cfs_rq_blocked_load(cfs_rq, !wakeup);
 +}
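
A standalone sketch of the sleep-time approximation used in the wake-up migration path above, where one carried decay period corresponds to 2^20 ns (~1ms):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int64_t decay_count = -25;	/* 25 remote decays accumulated while asleep */
	uint64_t approx_sleep_ns = (uint64_t)(-decay_count) << 20;

	/* This is the amount subtracted from last_runnable_update before updating. */
	printf("~%llu ms asleep\n", (unsigned long long)(approx_sleep_ns / 1000000));
	return 0;
}
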
 +
 +/*
 + * Remove se's load from this cfs_rq child load-average; if the entity is
 + * transitioning to a blocked state we track its projected decay using
 + * blocked_load_avg.
 + */
 +static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
 +                                                struct sched_entity *se,
 +                                                int sleep)
 +{
 +      update_entity_load_avg(se, 1);
 +      /* we force update consideration on load-balancer moves */
 +      update_cfs_rq_blocked_load(cfs_rq, !sleep);
 +
 +      cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
 +      if (sleep) {
 +              cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
 +              se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
 +      } /* migrations, e.g. sleep=0 leave decay_count == 0 */
 +}
 +#else
 +static inline void update_entity_load_avg(struct sched_entity *se,
 +                                        int update_cfs_rq) {}
 +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
 +static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
 +                                         struct sched_entity *se,
 +                                         int wakeup) {}
 +static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
 +                                         struct sched_entity *se,
 +                                         int sleep) {}
 +static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
 +                                            int force_update) {}
 +#endif
  
  static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
@@@ -1475,8 -1096,9 +1475,8 @@@ enqueue_entity(struct cfs_rq *cfs_rq, s
         * Update run-time statistics of the 'current'.
         */
        update_curr(cfs_rq);
 -      update_cfs_load(cfs_rq, 0);
        account_entity_enqueue(cfs_rq, se);
 -      update_cfs_shares(cfs_rq);
 +      enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
  
        if (flags & ENQUEUE_WAKEUP) {
                place_entity(cfs_rq, se, 0);
@@@ -1568,8 -1190,9 +1568,8 @@@ dequeue_entity(struct cfs_rq *cfs_rq, s
  
        if (se != cfs_rq->curr)
                __dequeue_entity(cfs_rq, se);
 -      se->on_rq = 0;
 -      update_cfs_load(cfs_rq, 0);
        account_entity_dequeue(cfs_rq, se);
 +      dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
  
        /*
         * Normalize the entity after updating the min_vruntime because the
        return_cfs_rq_runtime(cfs_rq);
  
        update_min_vruntime(cfs_rq);
 -      update_cfs_shares(cfs_rq);
 +      se->on_rq = 0;
  }
  
  /*
@@@ -1717,8 -1340,6 +1717,8 @@@ static void put_prev_entity(struct cfs_
                update_stats_wait_start(cfs_rq, prev);
                /* Put 'current' back into the tree. */
                __enqueue_entity(cfs_rq, prev);
 +              /* in !on_rq case, update occurred at dequeue */
 +              update_entity_load_avg(prev, 1);
        }
        cfs_rq->curr = NULL;
  }
@@@ -1732,10 -1353,9 +1732,10 @@@ entity_tick(struct cfs_rq *cfs_rq, stru
        update_curr(cfs_rq);
  
        /*
 -       * Update share accounting for long-running entities.
 +       * Ensure that runnable average is periodically updated.
         */
 -      update_entity_shares_tick(cfs_rq);
 +      update_entity_load_avg(curr, 1);
 +      update_cfs_rq_blocked_load(cfs_rq, 1);
  
  #ifdef CONFIG_SCHED_HRTICK
        /*
@@@ -1828,15 -1448,6 +1828,15 @@@ static inline struct cfs_bandwidth *tg_
        return &tg->cfs_bandwidth;
  }
  
 +/* rq->clock_task normalized against any time this cfs_rq has spent throttled */
 +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
 +{
 +      if (unlikely(cfs_rq->throttle_count))
 +              return cfs_rq->throttled_clock_task;
 +
 +      return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
 +}
 +
  /* returns 0 on failure to allocate runtime */
  static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
  {
@@@ -1981,9 -1592,14 +1981,9 @@@ static int tg_unthrottle_up(struct task
        cfs_rq->throttle_count--;
  #ifdef CONFIG_SMP
        if (!cfs_rq->throttle_count) {
 -              u64 delta = rq->clock_task - cfs_rq->load_stamp;
 -
 -              /* leaving throttled state, advance shares averaging windows */
 -              cfs_rq->load_stamp += delta;
 -              cfs_rq->load_last += delta;
 -
 -              /* update entity weight now that we are on_rq again */
 -              update_cfs_shares(cfs_rq);
 +              /* adjust cfs_rq_clock_task() */
 +              cfs_rq->throttled_clock_task_time += rq->clock_task -
 +                                           cfs_rq->throttled_clock_task;
        }
  #endif
  
@@@ -1995,9 -1611,9 +1995,9 @@@ static int tg_throttle_down(struct task
        struct rq *rq = data;
        struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
  
 -      /* group is entering throttled state, record last load */
 +      /* group is entering throttled state, stop time */
        if (!cfs_rq->throttle_count)
 -              update_cfs_load(cfs_rq, 0);
 +              cfs_rq->throttled_clock_task = rq->clock_task;
        cfs_rq->throttle_count++;
  
        return 0;
@@@ -2012,7 -1628,7 +2012,7 @@@ static void throttle_cfs_rq(struct cfs_
  
        se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
  
 -      /* account load preceding throttle */
 +      /* freeze hierarchy runnable averages while throttled */
        rcu_read_lock();
        walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
        rcu_read_unlock();
                rq->nr_running -= task_delta;
  
        cfs_rq->throttled = 1;
 -      cfs_rq->throttled_timestamp = rq->clock;
 +      cfs_rq->throttled_clock = rq->clock;
        raw_spin_lock(&cfs_b->lock);
        list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
        raw_spin_unlock(&cfs_b->lock);
@@@ -2054,9 -1670,10 +2054,9 @@@ void unthrottle_cfs_rq(struct cfs_rq *c
  
        cfs_rq->throttled = 0;
        raw_spin_lock(&cfs_b->lock);
 -      cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
 +      cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
        list_del_rcu(&cfs_rq->throttled_list);
        raw_spin_unlock(&cfs_b->lock);
 -      cfs_rq->throttled_timestamp = 0;
  
        update_rq_clock(rq);
        /* update hierarchical throttle state */
@@@ -2456,13 -2073,8 +2456,13 @@@ static void unthrottle_offline_cfs_rqs(
  }
  
  #else /* CONFIG_CFS_BANDWIDTH */
 -static __always_inline
 -void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {}
 +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
 +{
 +      return rq_of(cfs_rq)->clock_task;
 +}
 +
 +static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
 +                                   unsigned long delta_exec) {}
  static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
  static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
  static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@@ -2595,14 -2207,12 +2595,14 @@@ enqueue_task_fair(struct rq *rq, struc
                if (cfs_rq_throttled(cfs_rq))
                        break;
  
 -              update_cfs_load(cfs_rq, 0);
 -              update_cfs_shares(cfs_rq);
 +              update_entity_load_avg(se, 1);
 +              update_cfs_rq_blocked_load(cfs_rq, 0);
        }
  
 -      if (!se)
 +      if (!se) {
 +              update_rq_runnable_avg(rq, rq->nr_running);
                inc_nr_running(rq);
 +      }
        hrtick_update(rq);
  }
  
@@@ -2656,14 -2266,12 +2656,14 @@@ static void dequeue_task_fair(struct r
                if (cfs_rq_throttled(cfs_rq))
                        break;
  
 -              update_cfs_load(cfs_rq, 0);
 -              update_cfs_shares(cfs_rq);
 +              update_entity_load_avg(se, 1);
 +              update_cfs_rq_blocked_load(cfs_rq, 0);
        }
  
 -      if (!se)
 +      if (!se) {
                dec_nr_running(rq);
 +              update_rq_runnable_avg(rq, 1);
 +      }
        hrtick_update(rq);
  }
  
@@@ -3173,37 -2781,6 +3173,37 @@@ unlock
  
        return new_cpu;
  }
 +
 +/*
 + * Load tracking only depends on SMP; the FAIR_GROUP_SCHED dependency below may
 + * be removed once load tracking is useful for applications beyond shares
 + * distribution (e.g. load-balance).
 + */
 +#ifdef CONFIG_FAIR_GROUP_SCHED
 +/*
 + * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
 + * cfs_rq_of(p) references at time of call are still valid and identify the
 + * previous cpu.  However, the caller only guarantees p->pi_lock is held; no
 + * other assumptions, including the state of rq->lock, should be made.
 + */
 +static void
 +migrate_task_rq_fair(struct task_struct *p, int next_cpu)
 +{
 +      struct sched_entity *se = &p->se;
 +      struct cfs_rq *cfs_rq = cfs_rq_of(se);
 +
 +      /*
 +       * Load tracking: accumulate removed load so that it can be processed
 +       * when we next update the owning cfs_rq under rq->lock.  Tasks contribute
 +       * to blocked load iff they have a positive decay-count.  It can never
 +       * be negative here since on-rq tasks have decay-count == 0.
 +       */
 +      if (se->avg.decay_count) {
 +              se->avg.decay_count = -__synchronize_entity_decay(se);
 +              atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
 +      }
 +}
 +#endif
  #endif /* CONFIG_SMP */
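/*
 * Illustrative sketch of the consumer side: the migration path above only
 * accumulates into cfs_rq->removed_load; the next blocked-load update taken
 * under rq->lock is expected to fold it back out, roughly as below.  The
 * exact placement inside update_cfs_rq_blocked_load() is an assumption, but
 * subtract_blocked_load_contrib() and the atomic64 counter are the ones used
 * elsewhere in this patch.
 */
static inline void drain_removed_load_sketch(struct cfs_rq *cfs_rq)
{
	if (atomic64_read(&cfs_rq->removed_load)) {
		u64 removed = atomic64_xchg(&cfs_rq->removed_load, 0);

		/* take the departed tasks' contribution out of blocked load */
		subtract_blocked_load_contrib(cfs_rq, removed);
	}
}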
  
  static unsigned long
@@@ -3330,7 -2907,7 +3330,7 @@@ static void check_preempt_wakeup(struc
         * Batch and idle tasks do not preempt non-idle tasks (their preemption
         * is driven by the tick):
         */
-       if (unlikely(p->policy != SCHED_NORMAL))
+       if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
                return;
  
        find_matching_se(&se, &pse);
@@@ -3456,122 -3033,8 +3456,122 @@@ static bool yield_to_task_fair(struct r
  
  #ifdef CONFIG_SMP
  /**************************************************
 - * Fair scheduling class load-balancing methods:
 - */
 + * Fair scheduling class load-balancing methods.
 + *
 + * BASICS
 + *
 + * The purpose of load-balancing is to achieve the same basic fairness the
 + * per-cpu scheduler provides, namely provide a proportional amount of compute
 + * time to each task. This is expressed in the following equation:
 + *
 + *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1)
 + *
 + * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
 + * W_i,0 is defined as:
 + *
 + *   W_i,0 = \Sum_j w_i,j                                             (2)
 + *
 + * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
 + * is derived from the nice value as per prio_to_weight[].
 + *
 + * The weight average is an exponential decay average of the instantaneous
 + * weight:
 + *
 + *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
 + *
 + * P_i is the cpu power (or compute capacity) of cpu i; typically it is the
 + * fraction of 'recent' time available for SCHED_OTHER task execution. But it
 + * can also include other factors [XXX].
 + *
 + * To achieve this balance we define a measure of imbalance which follows
 + * directly from (1):
 + *
 + *   imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j }    (4)
 + *
 + * We then move tasks around to minimize the imbalance. In the continuous
 + * function space it is obvious this converges, in the discrete case we get
 + * a few fun cases generally called infeasible weight scenarios.
 + *
 + * [XXX expand on:
 + *     - infeasible weights;
 + *     - local vs global optima in the discrete case. ]
 + *
 + *
 + * SCHED DOMAINS
 + *
 + * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
 + * for all i,j solution, we create a tree of cpus that follows the hardware
 + * topology where each level pairs two lower groups (or better). This results
 + * in O(log n) layers. Furthermore we reduce the number of cpus going up the
 + * tree to only the first of the previous level and we decrease the frequency
 + * of load-balance at each level inversely proportional to the number of cpus in
 + * the groups.
 + *
 + * This yields:
 + *
 + *     log_2 n     1     n
 + *   \Sum       { --- * --- * 2^i } = O(n)                            (5)
 + *     i = 0      2^i   2^i
 + *                               `- size of each group
 + *         |         |     `- number of cpus doing load-balance
 + *         |         `- freq
 + *         `- sum over all levels
 + *
 + * Coupled with a limit on how many tasks we can migrate every balance pass,
 + * this makes (5) the runtime complexity of the balancer.
 + *
 + * An important property here is that each CPU is still (indirectly) connected
 + * to every other cpu in at most O(log n) steps:
 + *
 + * The adjacency matrix of the resulting graph is given by:
 + *
 + *             log_2 n     
 + *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
 + *             k = 0
 + *
 + * And you'll find that:
 + *
 + *   A^(log_2 n)_i,j != 0  for all i,j                                (7)
 + *
 + * Showing there's indeed a path between every cpu in at most O(log n) steps.
 + * The task movement gives a factor of O(m), giving a convergence complexity
 + * of:
 + *
 + *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8)
 + *
 + *
 + * WORK CONSERVING
 + *
 + * In order to avoid CPUs going idle while there's still work to do, new idle
 + * balancing is more aggressive and has the newly idle cpu iterate up the domain
 + * tree itself instead of relying on other CPUs to bring it work.
 + *
 + * This adds some complexity to both (5) and (8) but it reduces the total idle
 + * time.
 + *
 + * [XXX more?]
 + *
 + *
 + * CGROUPS
 + *
 + * Cgroups make a horror show out of (2), instead of a simple sum we get:
 + *
 + *                                s_k,i
 + *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
 + *                                 S_k
 + *
 + * Where
 + *
 + *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10)
 + *
 + * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
 + *
 + * The big problem is S_k: it's a global sum needed to compute a local (W_i)
 + * property.
 + *
 + * [XXX write more on how we solve this.. _after_ merging pjt's patches that
 + *      rewrite all of this once again.]
 + */ 
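/*
 * Worked check of (5), derived only from the annotations above: each level's
 * term is freq * nr_balancing_cpus * group_size = (1/2^i) * (n/2^i) * 2^i
 * = n/2^i, so the series is geometric:
 *
 *   \Sum_{i=0}^{\log_2 n} n/2^i  =  n * (2 - 1/n)  <  2n  =  O(n)
 */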
  
  static unsigned long __read_mostly max_load_balance_interval = HZ/10;
  
@@@ -3837,58 -3300,52 +3837,58 @@@ next
  /*
   * update tg->load_weight by folding this cpu's load_avg
   */
 -static int update_shares_cpu(struct task_group *tg, int cpu)
 +static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
  {
 -      struct cfs_rq *cfs_rq;
 -      unsigned long flags;
 -      struct rq *rq;
 -
 -      if (!tg->se[cpu])
 -              return 0;
 -
 -      rq = cpu_rq(cpu);
 -      cfs_rq = tg->cfs_rq[cpu];
 -
 -      raw_spin_lock_irqsave(&rq->lock, flags);
 +      struct sched_entity *se = tg->se[cpu];
 +      struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
  
 -      update_rq_clock(rq);
 -      update_cfs_load(cfs_rq, 1);
 +      /* throttled entities do not contribute to load */
 +      if (throttled_hierarchy(cfs_rq))
 +              return;
  
 -      /*
 -       * We need to update shares after updating tg->load_weight in
 -       * order to adjust the weight of groups with long running tasks.
 -       */
 -      update_cfs_shares(cfs_rq);
 +      update_cfs_rq_blocked_load(cfs_rq, 1);
  
 -      raw_spin_unlock_irqrestore(&rq->lock, flags);
 -
 -      return 0;
 +      if (se) {
 +              update_entity_load_avg(se, 1);
 +              /*
 +               * We pivot on our runnable average having decayed to zero for
 +               * list removal.  This generally implies that all our children
 +               * have also been removed (modulo rounding error or bandwidth
 +               * control); however, such cases are rare and we can fix these
 +               * at enqueue.
 +               *
 +               * TODO: fix up out-of-order children on enqueue.
 +               */
 +              if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
 +                      list_del_leaf_cfs_rq(cfs_rq);
 +      } else {
 +              struct rq *rq = rq_of(cfs_rq);
 +              update_rq_runnable_avg(rq, rq->nr_running);
 +      }
  }
  
 -static void update_shares(int cpu)
 +static void update_blocked_averages(int cpu)
  {
 -      struct cfs_rq *cfs_rq;
        struct rq *rq = cpu_rq(cpu);
 +      struct cfs_rq *cfs_rq;
 +      unsigned long flags;
  
 -      rcu_read_lock();
 +      raw_spin_lock_irqsave(&rq->lock, flags);
 +      update_rq_clock(rq);
        /*
         * Iterates the task_group tree in a bottom up fashion, see
         * list_add_leaf_cfs_rq() for details.
         */
        for_each_leaf_cfs_rq(rq, cfs_rq) {
 -              /* throttled entities do not contribute to load */
 -              if (throttled_hierarchy(cfs_rq))
 -                      continue;
 -
 -              update_shares_cpu(cfs_rq->tg, cpu);
 +              /*
 +               * Note: We may want to consider periodically releasing
 +               * rq->lock around these updates so that creating many task
 +               * groups does not result in continually extending hold time.
 +               */
 +              __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
        }
 -      rcu_read_unlock();
 +
 +      raw_spin_unlock_irqrestore(&rq->lock, flags);
  }
  
  /*
@@@ -3940,7 -3397,7 +3940,7 @@@ static unsigned long task_h_load(struc
        return load;
  }
  #else
 -static inline void update_shares(int cpu)
 +static inline void update_blocked_averages(int cpu)
  {
  }
  
@@@ -5000,14 -4457,12 +5000,14 @@@ void idle_balance(int this_cpu, struct 
        if (this_rq->avg_idle < sysctl_sched_migration_cost)
                return;
  
 +      update_rq_runnable_avg(this_rq, 1);
 +
        /*
         * Drop the rq->lock, but keep IRQ/preempt disabled.
         */
        raw_spin_unlock(&this_rq->lock);
  
 -      update_shares(this_cpu);
 +      update_blocked_averages(this_cpu);
        rcu_read_lock();
        for_each_domain(this_cpu, sd) {
                unsigned long interval;
@@@ -5262,7 -4717,7 +5262,7 @@@ static void rebalance_domains(int cpu, 
        int update_next_balance = 0;
        int need_serialize;
  
 -      update_shares(cpu);
 +      update_blocked_averages(cpu);
  
        rcu_read_lock();
        for_each_domain(cpu, sd) {
@@@ -5499,8 -4954,6 +5499,8 @@@ static void task_tick_fair(struct rq *r
                cfs_rq = cfs_rq_of(se);
                entity_tick(cfs_rq, se, queued);
        }
 +
 +      update_rq_runnable_avg(rq, 1);
  }
  
  /*
@@@ -5593,20 -5046,6 +5593,20 @@@ static void switched_from_fair(struct r
                place_entity(cfs_rq, se, 0);
                se->vruntime -= cfs_rq->min_vruntime;
        }
 +
 +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
 +      /*
 +       * Remove our load contribution when we leave sched_fair
 +       * and ensure we don't carry an old decay_count over if we
 +       * switch back.
 +       */
 +      if (p->se.avg.decay_count) {
 +              struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
 +              __synchronize_entity_decay(&p->se);
 +              subtract_blocked_load_contrib(cfs_rq,
 +                              p->se.avg.load_avg_contrib);
 +      }
 +#endif
  }
  
  /*
@@@ -5653,16 -5092,11 +5653,16 @@@ void init_cfs_rq(struct cfs_rq *cfs_rq
  #ifndef CONFIG_64BIT
        cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
  #endif
 +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
 +      atomic64_set(&cfs_rq->decay_counter, 1);
 +      atomic64_set(&cfs_rq->removed_load, 0);
 +#endif
  }
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
  static void task_move_group_fair(struct task_struct *p, int on_rq)
  {
 +      struct cfs_rq *cfs_rq;
        /*
         * If the task was not on the rq at the time of this cgroup movement
         * it must have been asleep, sleeping tasks keep their ->vruntime
        if (!on_rq)
                p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
        set_task_rq(p, task_cpu(p));
 -      if (!on_rq)
 -              p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
 +      if (!on_rq) {
 +              cfs_rq = cfs_rq_of(&p->se);
 +              p->se.vruntime += cfs_rq->min_vruntime;
 +#ifdef CONFIG_SMP
 +              /*
 +               * migrate_task_rq_fair() will have removed our previous
 +               * contribution, but we must synchronize for ongoing future
 +               * decay.
 +               */
 +              p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
 +              cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
 +#endif
 +      }
  }
  
  void free_fair_sched_group(struct task_group *tg)
@@@ -5791,6 -5214,10 +5791,6 @@@ void init_tg_cfs_entry(struct task_grou
  
        cfs_rq->tg = tg;
        cfs_rq->rq = rq;
 -#ifdef CONFIG_SMP
 -      /* allow initial update_cfs_load() to truncate */
 -      cfs_rq->load_stamp = 1;
 -#endif
        init_cfs_rq_runtime(cfs_rq);
  
        tg->cfs_rq[cpu] = cfs_rq;
@@@ -5837,11 -5264,8 +5837,11 @@@ int sched_group_set_shares(struct task_
                se = tg->se[i];
                /* Propagate contribution to hierarchy */
                raw_spin_lock_irqsave(&rq->lock, flags);
 -              for_each_sched_entity(se)
 +              for_each_sched_entity(se) {
                        update_cfs_shares(group_cfs_rq(se));
 +                      /* update contribution to parent */
 +                      update_entity_load_avg(se, 1);
 +              }
                raw_spin_unlock_irqrestore(&rq->lock, flags);
        }
  
@@@ -5895,9 -5319,7 +5895,9 @@@ const struct sched_class fair_sched_cla
  
  #ifdef CONFIG_SMP
        .select_task_rq         = select_task_rq_fair,
 -
 +#ifdef CONFIG_FAIR_GROUP_SCHED
 +      .migrate_task_rq        = migrate_task_rq_fair,
 +#endif
        .rq_online              = rq_online_fair,
        .rq_offline             = rq_offline_fair,
  
diff --combined kernel/sysctl.c
index 26f65eaa01f9c94366aa5156f9c304bbc589808f,2914d0f752cf9f7493ef44415e94aacf232c721c..b0fa5ad09873f874775cc72ffd534bfec825f639
  extern int sysctl_overcommit_memory;
  extern int sysctl_overcommit_ratio;
  extern int max_threads;
 -extern int core_uses_pid;
  extern int suid_dumpable;
 +#ifdef CONFIG_COREDUMP
 +extern int core_uses_pid;
  extern char core_pattern[];
  extern unsigned int core_pipe_limit;
 +#endif
  extern int pid_max;
  extern int min_free_kbytes;
  extern int pid_max_min, pid_max_max;
@@@ -179,10 -177,8 +179,10 @@@ static int proc_dointvec_minmax_sysadmi
  
  static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp, loff_t *ppos);
 +#ifdef CONFIG_COREDUMP
  static int proc_dostring_coredump(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp, loff_t *ppos);
 +#endif
  
  #ifdef CONFIG_MAGIC_SYSRQ
  /* Note: sysrq code uses it's own private copy */
@@@ -367,10 -363,8 +367,8 @@@ static struct ctl_table kern_table[] = 
                .procname       = "sched_autogroup_enabled",
                .data           = &sysctl_sched_autogroup_enabled,
                .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_minmax,
-               .extra1         = &zero,
-               .extra2         = &one,
+               .mode           = 0444,
+               .proc_handler   = proc_dointvec,
        },
  #endif
  #ifdef CONFIG_CFS_BANDWIDTH
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
 +#ifdef CONFIG_COREDUMP
        {
                .procname       = "core_uses_pid",
                .data           = &core_uses_pid,
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
 +#endif
  #ifdef CONFIG_PROC_SYSCTL
        {
                .procname       = "tainted",
@@@ -1549,7 -1541,8 +1547,7 @@@ static struct ctl_table fs_table[] = 
  };
  
  static struct ctl_table debug_table[] = {
 -#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \
 -    defined(CONFIG_S390) || defined(CONFIG_TILE)
 +#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE
        {
                .procname       = "exception-trace",
                .data           = &show_unhandled_signals,
@@@ -2041,14 -2034,12 +2039,14 @@@ int proc_dointvec_minmax(struct ctl_tab
  
  static void validate_coredump_safety(void)
  {
 +#ifdef CONFIG_COREDUMP
        if (suid_dumpable == SUID_DUMPABLE_SAFE &&
            core_pattern[0] != '/' && core_pattern[0] != '|') {
                printk(KERN_WARNING "Unsafe core_pattern used with "\
                        "suid_dumpable=2. Pipe handler or fully qualified "\
                        "core dump path required.\n");
        }
 +#endif
  }
  
  static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
        return error;
  }
  
 +#ifdef CONFIG_COREDUMP
  static int proc_dostring_coredump(struct ctl_table *table, int write,
                  void __user *buffer, size_t *lenp, loff_t *ppos)
  {
                validate_coredump_safety();
        return error;
  }
 +#endif
  
  static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
                                     void __user *buffer,