Merge branch 'perf/core' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux...
author Steven Rostedt <srostedt@redhat.com>
Tue, 18 May 2010 02:26:53 +0000 (22:26 -0400)
committer Steven Rostedt <rostedt@goodmis.org>
Tue, 18 May 2010 04:35:23 +0000 (00:35 -0400)
Conflicts:
include/trace/ftrace.h
kernel/trace/trace_kprobe.c

Acked-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
81 files changed:
Documentation/RCU/torture.txt
Documentation/kernel-parameters.txt
Documentation/scheduler/sched-design-CFS.txt
Documentation/scheduler/sched-rt-group.txt
Documentation/trace/events.txt
Documentation/trace/ftrace.txt
arch/s390/kernel/time.c
drivers/char/sysrq.c
drivers/cpufreq/cpufreq_ondemand.c
drivers/oprofile/cpu_buffer.c
drivers/xen/manage.c
fs/eventpoll.c
include/linux/cpuset.h
include/linux/ftrace.h
include/linux/ftrace_event.h
include/linux/kernel.h
include/linux/module.h
include/linux/rcutiny.h
include/linux/rcutree.h
include/linux/ring_buffer.h
include/linux/sched.h
include/linux/stop_machine.h
include/linux/syscalls.h
include/linux/tick.h
include/linux/tracepoint.h
include/linux/wait.h
include/trace/define_trace.h
include/trace/events/module.h
include/trace/events/napi.h
include/trace/events/sched.h
include/trace/events/signal.h
include/trace/ftrace.h
include/trace/syscall.h
init/Kconfig
kernel/Makefile
kernel/capability.c
kernel/cgroup.c
kernel/cpu.c
kernel/cpuset.c
kernel/cred-internals.h [deleted file]
kernel/cred.c
kernel/exit.c
kernel/module.c
kernel/rcutorture.c
kernel/sched.c
kernel/sched_debug.c
kernel/sched_fair.c
kernel/sched_features.h
kernel/sched_idletask.c
kernel/sched_rt.c
kernel/stop_machine.c
kernel/time/tick-sched.c
kernel/time/timer_list.c
kernel/trace/blktrace.c
kernel/trace/ftrace.c
kernel/trace/kmemtrace.c
kernel/trace/ring_buffer.c
kernel/trace/ring_buffer_benchmark.c
kernel/trace/trace.c
kernel/trace/trace.h
kernel/trace/trace_branch.c
kernel/trace/trace_event_perf.c
kernel/trace/trace_events.c
kernel/trace/trace_events_filter.c
kernel/trace/trace_export.c
kernel/trace/trace_functions_graph.c
kernel/trace/trace_irqsoff.c
kernel/trace/trace_kprobe.c
kernel/trace/trace_output.c
kernel/trace/trace_output.h
kernel/trace/trace_sched_switch.c
kernel/trace/trace_sched_wakeup.c
kernel/trace/trace_selftest.c
kernel/trace/trace_syscalls.c
kernel/trace/trace_workqueue.c
kernel/tracepoint.c
kernel/user.c
net/core/drop_monitor.c
samples/tracepoints/tp-samples-trace.h
samples/tracepoints/tracepoint-probe-sample.c
samples/tracepoints/tracepoint-probe-sample2.c

index 0e50bc2aa1e2e4f4682f5b5540a49987bfb00884..5d9016795fd825a43f2a4093e4edd870cb92e42c 100644 (file)
@@ -182,16 +182,6 @@ Similarly, sched_expedited RCU provides the following:
        sched_expedited-torture: Reader Pipe:  12660320201 95875 0 0 0 0 0 0 0 0 0
        sched_expedited-torture: Reader Batch:  12660424885 0 0 0 0 0 0 0 0 0 0
        sched_expedited-torture: Free-Block Circulation:  1090795 1090795 1090794 1090793 1090792 1090791 1090790 1090789 1090788 1090787 0
-       state: -1 / 0:0 3:0 4:0
-
-As before, the first four lines are similar to those for RCU.
-The last line shows the task-migration state.  The first number is
--1 if synchronize_sched_expedited() is idle, -2 if in the process of
-posting wakeups to the migration kthreads, and N when waiting on CPU N.
-Each of the colon-separated fields following the "/" is a CPU:state pair.
-Valid states are "0" for idle, "1" for waiting for quiescent state,
-"2" for passed through quiescent state, and "3" when a race with a
-CPU-hotplug event forces use of the synchronize_sched() primitive.
 
 
 USAGE
index 839b21b0699ac10a1991c47455cddd05ced6491b..907010cea9ada01dcfc301f1ad74cad870e777f8 100644 (file)
@@ -784,8 +784,12 @@ and is between 256 and 4096 characters. It is defined in the file
                        as early as possible in order to facilitate early
                        boot debugging.
 
-       ftrace_dump_on_oops
+       ftrace_dump_on_oops[=orig_cpu]
                        [FTRACE] will dump the trace buffers on oops.
+                       If no parameter is passed, ftrace will dump
+                       buffers of all CPUs, but if you pass orig_cpu, it will
+                       dump only the buffer of the CPU that triggered the
+                       oops.
 
        ftrace_filter=[function-list]
                        [FTRACE] Limit the functions traced by the function
index 6f33593e59e21d898fa38fd59ce297a438dfbdf1..8239ebbcddce1d9b84689b8e1be530243bee5f83 100644 (file)
@@ -211,7 +211,7 @@ provide fair CPU time to each such task group.  For example, it may be
 desirable to first provide fair CPU time to each user on the system and then to
 each task belonging to a user.
 
-CONFIG_GROUP_SCHED strives to achieve exactly that.  It lets tasks to be
+CONFIG_CGROUP_SCHED strives to achieve exactly that.  It lets tasks be
 grouped and divides CPU time fairly among such groups.
 
 CONFIG_RT_GROUP_SCHED permits to group real-time (i.e., SCHED_FIFO and
@@ -220,38 +220,11 @@ SCHED_RR) tasks.
 CONFIG_FAIR_GROUP_SCHED permits to group CFS (i.e., SCHED_NORMAL and
 SCHED_BATCH) tasks.
 
-At present, there are two (mutually exclusive) mechanisms to group tasks for
-CPU bandwidth control purposes:
-
- - Based on user id (CONFIG_USER_SCHED)
-
-   With this option, tasks are grouped according to their user id.
-
- - Based on "cgroup" pseudo filesystem (CONFIG_CGROUP_SCHED)
-
-   This options needs CONFIG_CGROUPS to be defined, and lets the administrator
+   These options need CONFIG_CGROUPS to be defined, and let the administrator
    create arbitrary groups of tasks, using the "cgroup" pseudo filesystem.  See
    Documentation/cgroups/cgroups.txt for more information about this filesystem.
 
-Only one of these options to group tasks can be chosen and not both.
-
-When CONFIG_USER_SCHED is defined, a directory is created in sysfs for each new
-user and a "cpu_share" file is added in that directory.
-
-       # cd /sys/kernel/uids
-       # cat 512/cpu_share             # Display user 512's CPU share
-       1024
-       # echo 2048 > 512/cpu_share     # Modify user 512's CPU share
-       # cat 512/cpu_share             # Display user 512's CPU share
-       2048
-       #
-
-CPU bandwidth between two users is divided in the ratio of their CPU shares.
-For example: if you would like user "root" to get twice the bandwidth of user
-"guest," then set the cpu_share for both the users such that "root"'s cpu_share
-is twice "guest"'s cpu_share.
-
-When CONFIG_CGROUP_SCHED is defined, a "cpu.shares" file is created for each
+When CONFIG_FAIR_GROUP_SCHED is defined, a "cpu.shares" file is created for each
 group created using the pseudo filesystem.  See example steps below to create
 task groups and modify their CPU share using the "cgroups" pseudo filesystem.
 
@@ -273,24 +246,3 @@ task groups and modify their CPU share using the "cgroups" pseudo filesystem.
 
        # #Launch gmplayer (or your favourite movie player)
        # echo <movie_player_pid> > multimedia/tasks
-
-8. Implementation note: user namespaces
-
-User namespaces are intended to be hierarchical.  But they are currently
-only partially implemented.  Each of those has ramifications for CFS.
-
-First, since user namespaces are hierarchical, the /sys/kernel/uids
-presentation is inadequate.  Eventually we will likely want to use sysfs
-tagging to provide private views of /sys/kernel/uids within each user
-namespace.
-
-Second, the hierarchical nature is intended to support completely
-unprivileged use of user namespaces.  So if using user groups, then
-we want the users in a user namespace to be children of the user
-who created it.
-
-That is currently unimplemented.  So instead, every user in a new
-user namespace will receive 1024 shares just like any user in the
-initial user namespace.  Note that at the moment creation of a new
-user namespace requires each of CAP_SYS_ADMIN, CAP_SETUID, and
-CAP_SETGID.
index 86eabe6c3419fd4f26aba47a090ecc1caff9a1fd..605b0d40329d843f6c3b838cd4afa5e38438d31e 100644 (file)
@@ -126,23 +126,12 @@ priority!
 2.3 Basis for grouping tasks
 ----------------------------
 
-There are two compile-time settings for allocating CPU bandwidth. These are
-configured using the "Basis for grouping tasks" multiple choice menu under
-General setup > Group CPU Scheduler:
-
-a. CONFIG_USER_SCHED (aka "Basis for grouping tasks" =  "user id")
-
-This lets you use the virtual files under
-"/sys/kernel/uids/<uid>/cpu_rt_runtime_us" to control he CPU time reserved for
-each user .
-
-The other option is:
-
-.o CONFIG_CGROUP_SCHED (aka "Basis for grouping tasks" = "Control groups")
+Enabling CONFIG_RT_GROUP_SCHED lets you explicitly allocate real
+CPU bandwidth to task groups.
 
 This uses the /cgroup virtual file system and
 "/cgroup/<cgroup>/cpu.rt_runtime_us" to control the CPU time reserved for each
-control group instead.
+control group.
 
 For more information on working with control groups, you should read
 Documentation/cgroups/cgroups.txt as well.
@@ -161,8 +150,7 @@ For now, this can be simplified to just the following (but see Future plans):
 ===============
 
 There is work in progress to make the scheduling period for each group
-("/sys/kernel/uids/<uid>/cpu_rt_period_us" or
-"/cgroup/<cgroup>/cpu.rt_period_us" respectively) configurable as well.
+("/cgroup/<cgroup>/cpu.rt_period_us") configurable as well.
 
 The constraint on the period is that a subgroup must have a smaller or
 equal period to its parent. But realistically its not very useful _yet_
index 02ac6ed38b2d0ba73c44298484b7d7bac540dae2..778ddf38b82cab70ecacf5739f723cfd45a64673 100644 (file)
@@ -90,7 +90,8 @@ In order to facilitate early boot debugging, use boot option:
 
        trace_event=[event-list]
 
-The format of this boot option is the same as described in section 2.1.
+event-list is a comma-separated list of events. See section 2.1 for event
+format.
 
 3. Defining an event-enabled tracepoint
 =======================================
index 03485bfbd7975792f50d54dabf77533938f255c9..557c1edeccaf72535464298743cddd3c8eb01bea 100644 (file)
@@ -155,6 +155,9 @@ of ftrace. Here is a list of some of the key files:
        to be traced. Echoing names of functions into this file
        will limit the trace to only those functions.
 
+       This interface also allows for commands to be used. See the
+       "Filter commands" section for more details.
+
   set_ftrace_notrace:
 
        This has an effect opposite to that of
@@ -1337,12 +1340,14 @@ ftrace_dump_on_oops must be set. To set ftrace_dump_on_oops, one
 can either use the sysctl function or set it via the proc system
 interface.
 
-  sysctl kernel.ftrace_dump_on_oops=1
+  sysctl kernel.ftrace_dump_on_oops=n
 
 or
 
-  echo 1 > /proc/sys/kernel/ftrace_dump_on_oops
+  echo n > /proc/sys/kernel/ftrace_dump_on_oops
 
+If n = 1, ftrace will dump the buffers of all CPUs; if n = 2, ftrace will
+only dump the buffer of the CPU that triggered the oops.
 
 Here's an example of such a dump after a null pointer
 dereference in a kernel module:
@@ -1822,6 +1827,47 @@ this special filter via:
  echo > set_graph_function
 
 
+Filter commands
+---------------
+
+A few commands are supported by the set_ftrace_filter interface.
+Trace commands have the following format:
+
+<function>:<command>:<parameter>
+
+The following commands are supported:
+
+- mod
+  This command enables function filtering per module. The
+  parameter defines the module. For example, if only the write*
+  functions in the ext3 module are desired, run:
+
+   echo 'write*:mod:ext3' > set_ftrace_filter
+
+  This command interacts with the filter in the same way as
+  filtering based on function names. Thus, adding more functions
+  in a different module is accomplished by appending (>>) to the
+  filter file. Remove specific module functions by prepending
+  '!':
+
+   echo '!writeback*:mod:ext3' >> set_ftrace_filter
+
+- traceon/traceoff
+  These commands turn tracing on and off when the specified
+  functions are hit. The parameter determines how many times the
+  tracing system is turned on and off. If unspecified, there is
+  no limit. For example, to disable tracing when a schedule bug
+  is hit the first 5 times, run:
+
+   echo '__schedule_bug:traceoff:5' > set_ftrace_filter
+
+  These commands are cumulative whether or not they are appended
+  to set_ftrace_filter. To remove a command, prepend it by '!'
+  and drop the parameter:
+
+   echo '!__schedule_bug:traceoff' > set_ftrace_filter
+
+
 trace_pipe
 ----------
 
index d906bf19c14a2a309d7aca26d0903a5503987559..a2163c95eb9845ffac908bf09b7af1bb5084cf3c 100644 (file)
@@ -391,7 +391,6 @@ static void __init time_init_wq(void)
        if (time_sync_wq)
                return;
        time_sync_wq = create_singlethread_workqueue("timesync");
-       stop_machine_create();
 }
 
 /*
index 59de2525d3030ac972f48295f1b5191513784790..d4e8b213a46254f34622be1a3026e601e170b463 100644 (file)
@@ -289,7 +289,7 @@ static struct sysrq_key_op sysrq_showstate_blocked_op = {
 
 static void sysrq_ftrace_dump(int key, struct tty_struct *tty)
 {
-       ftrace_dump();
+       ftrace_dump(DUMP_ALL);
 }
 static struct sysrq_key_op sysrq_ftrace_dump_op = {
        .handler        = sysrq_ftrace_dump,
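
The ftrace_dump() signature change this hunk adapts to comes from the include/linux/kernel.h hunk further down, which adds enum ftrace_dump_mode. A minimal sketch of a caller under the new API, assuming only that enum and prototype; my_oops_dump() is a hypothetical helper, not part of this merge:

    #include <linux/kernel.h>       /* enum ftrace_dump_mode, ftrace_dump() */

    /* Hypothetical debug hook: choose how much of the trace to dump. */
    static void my_oops_dump(int all_cpus)
    {
            /*
             * DUMP_ALL matches the old ftrace_dump(void) behaviour;
             * DUMP_ORIG limits the dump to the CPU that hit the problem.
             */
            ftrace_dump(all_cpus ? DUMP_ALL : DUMP_ORIG);
    }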
index bd444dc93cf2ebf55c3e8545be249c6f025b7308..8e9dbdc6c7003cd06f3063f7b0d6aaedd5dd1d53 100644 (file)
@@ -73,6 +73,7 @@ enum {DBS_NORMAL_SAMPLE, DBS_SUB_SAMPLE};
 
 struct cpu_dbs_info_s {
        cputime64_t prev_cpu_idle;
+       cputime64_t prev_cpu_iowait;
        cputime64_t prev_cpu_wall;
        cputime64_t prev_cpu_nice;
        struct cpufreq_policy *cur_policy;
@@ -108,6 +109,7 @@ static struct dbs_tuners {
        unsigned int down_differential;
        unsigned int ignore_nice;
        unsigned int powersave_bias;
+       unsigned int io_is_busy;
 } dbs_tuners_ins = {
        .up_threshold = DEF_FREQUENCY_UP_THRESHOLD,
        .down_differential = DEF_FREQUENCY_DOWN_DIFFERENTIAL,
@@ -148,6 +150,16 @@ static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall)
        return idle_time;
 }
 
+static inline cputime64_t get_cpu_iowait_time(unsigned int cpu, cputime64_t *wall)
+{
+       u64 iowait_time = get_cpu_iowait_time_us(cpu, wall);
+
+       if (iowait_time == -1ULL)
+               return 0;
+
+       return iowait_time;
+}
+
 /*
  * Find right freq to be set now with powersave_bias on.
  * Returns the freq_hi to be used right now and will set freq_hi_jiffies,
@@ -249,6 +261,7 @@ static ssize_t show_##file_name                                             \
        return sprintf(buf, "%u\n", dbs_tuners_ins.object);             \
 }
 show_one(sampling_rate, sampling_rate);
+show_one(io_is_busy, io_is_busy);
 show_one(up_threshold, up_threshold);
 show_one(ignore_nice_load, ignore_nice);
 show_one(powersave_bias, powersave_bias);
@@ -299,6 +312,23 @@ static ssize_t store_sampling_rate(struct kobject *a, struct attribute *b,
        return count;
 }
 
+static ssize_t store_io_is_busy(struct kobject *a, struct attribute *b,
+                                  const char *buf, size_t count)
+{
+       unsigned int input;
+       int ret;
+
+       ret = sscanf(buf, "%u", &input);
+       if (ret != 1)
+               return -EINVAL;
+
+       mutex_lock(&dbs_mutex);
+       dbs_tuners_ins.io_is_busy = !!input;
+       mutex_unlock(&dbs_mutex);
+
+       return count;
+}
+
 static ssize_t store_up_threshold(struct kobject *a, struct attribute *b,
                                  const char *buf, size_t count)
 {
@@ -381,6 +411,7 @@ static struct global_attr _name = \
 __ATTR(_name, 0644, show_##_name, store_##_name)
 
 define_one_rw(sampling_rate);
+define_one_rw(io_is_busy);
 define_one_rw(up_threshold);
 define_one_rw(ignore_nice_load);
 define_one_rw(powersave_bias);
@@ -392,6 +423,7 @@ static struct attribute *dbs_attributes[] = {
        &up_threshold.attr,
        &ignore_nice_load.attr,
        &powersave_bias.attr,
+       &io_is_busy.attr,
        NULL
 };
 
@@ -470,14 +502,15 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
 
        for_each_cpu(j, policy->cpus) {
                struct cpu_dbs_info_s *j_dbs_info;
-               cputime64_t cur_wall_time, cur_idle_time;
-               unsigned int idle_time, wall_time;
+               cputime64_t cur_wall_time, cur_idle_time, cur_iowait_time;
+               unsigned int idle_time, wall_time, iowait_time;
                unsigned int load, load_freq;
                int freq_avg;
 
                j_dbs_info = &per_cpu(od_cpu_dbs_info, j);
 
                cur_idle_time = get_cpu_idle_time(j, &cur_wall_time);
+               cur_iowait_time = get_cpu_iowait_time(j, &cur_wall_time);
 
                wall_time = (unsigned int) cputime64_sub(cur_wall_time,
                                j_dbs_info->prev_cpu_wall);
@@ -487,6 +520,10 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
                                j_dbs_info->prev_cpu_idle);
                j_dbs_info->prev_cpu_idle = cur_idle_time;
 
+               iowait_time = (unsigned int) cputime64_sub(cur_iowait_time,
+                               j_dbs_info->prev_cpu_iowait);
+               j_dbs_info->prev_cpu_iowait = cur_iowait_time;
+
                if (dbs_tuners_ins.ignore_nice) {
                        cputime64_t cur_nice;
                        unsigned long cur_nice_jiffies;
@@ -504,6 +541,16 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
                        idle_time += jiffies_to_usecs(cur_nice_jiffies);
                }
 
+               /*
+                * For the purpose of ondemand, waiting for disk IO is an
+                * indication that you're performance critical, and not that
+                * the system is actually idle. So subtract the iowait time
+                * from the cpu idle time.
+                */
+
+               if (dbs_tuners_ins.io_is_busy && idle_time >= iowait_time)
+                       idle_time -= iowait_time;
+
                if (unlikely(!wall_time || wall_time < idle_time))
                        continue;
 
@@ -617,6 +664,29 @@ static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info)
        cancel_delayed_work_sync(&dbs_info->work);
 }
 
+/*
+ * Not all CPUs want IO time to be accounted as busy; this depends on how
+ * efficient idling at a higher frequency/voltage is.
+ * Pavel Machek says this is not so for various generations of AMD and old
+ * Intel systems.
+ * Mike Chan (androidlcom) claims this is also not true for ARM.
+ * Because of this, whitelist specific known series of CPUs by default, and
+ * leave all others up to the user.
+ */
+static int should_io_be_busy(void)
+{
+#if defined(CONFIG_X86)
+       /*
+        * For Intel, Core 2 (model 15) and later have an efficient idle.
+        */
+       if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
+           boot_cpu_data.x86 == 6 &&
+           boot_cpu_data.x86_model >= 15)
+               return 1;
+#endif
+       return 0;
+}
+
 static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
                                   unsigned int event)
 {
@@ -679,6 +749,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
                        dbs_tuners_ins.sampling_rate =
                                max(min_sampling_rate,
                                    latency * LATENCY_MULTIPLIER);
+                       dbs_tuners_ins.io_is_busy = should_io_be_busy();
                }
                mutex_unlock(&dbs_mutex);
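
The io_is_busy handling added above boils down to treating iowait as busy time before the load percentage is computed. A sketch restating that arithmetic in isolation; sample_load_pct() is hypothetical and only mirrors the per-CPU loop above (times in microseconds over one sampling window):

    /*
     * Illustrative only: with io_is_busy set, iowait is subtracted from
     * idle, so time spent waiting on disk counts as load.
     */
    static unsigned int sample_load_pct(unsigned int wall_time,
                                        unsigned int idle_time,
                                        unsigned int iowait_time,
                                        unsigned int io_is_busy)
    {
            if (io_is_busy && idle_time >= iowait_time)
                    idle_time -= iowait_time;

            if (!wall_time || wall_time < idle_time)
                    return 0;

            return 100 * (wall_time - idle_time) / wall_time;
    }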
 
index 166b67ea622f11563a33c539b78c5dafc7c2503f..7581dbe456da91de62aa5f96a8479f9058999390 100644 (file)
@@ -186,14 +186,14 @@ int op_cpu_buffer_write_commit(struct op_entry *entry)
 struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu)
 {
        struct ring_buffer_event *e;
-       e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL);
+       e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL, NULL);
        if (e)
                goto event;
        if (ring_buffer_swap_cpu(op_ring_buffer_read,
                                 op_ring_buffer_write,
                                 cpu))
                return NULL;
-       e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL);
+       e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL, NULL);
        if (e)
                goto event;
        return NULL;
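
ring_buffer_consume() (like ring_buffer_peek()) now takes an extra lost_events pointer; callers that do not care pass NULL, as oprofile does above. A sketch of a consumer that does collect the reported overruns, assuming only the prototypes from the include/linux/ring_buffer.h hunk below; drain_cpu_buffer() is hypothetical:

    #include <linux/ring_buffer.h>

    /* Drain one CPU's buffer, accumulating the lost-event counts it reports. */
    static unsigned long drain_cpu_buffer(struct ring_buffer *rb, int cpu)
    {
            struct ring_buffer_event *event;
            unsigned long lost = 0, lost_total = 0;

            while ((event = ring_buffer_consume(rb, cpu, NULL, &lost))) {
                    /* process ring_buffer_event_data(event) here */
                    lost_total += lost;
            }
            return lost_total;
    }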
index 2ac4440e7b087c1a9ac9c0b087d09a12be3f3a2e..8943b8ccee1a2ba3c35ac8eabfc14bd716fb136c 100644 (file)
@@ -80,12 +80,6 @@ static void do_suspend(void)
 
        shutting_down = SHUTDOWN_SUSPEND;
 
-       err = stop_machine_create();
-       if (err) {
-               printk(KERN_ERR "xen suspend: failed to setup stop_machine %d\n", err);
-               goto out;
-       }
-
 #ifdef CONFIG_PREEMPT
        /* If the kernel is preemptible, we need to freeze all the processes
           to prevent them from being in the middle of a pagetable update
@@ -93,7 +87,7 @@ static void do_suspend(void)
        err = freeze_processes();
        if (err) {
                printk(KERN_ERR "xen suspend: freeze failed %d\n", err);
-               goto out_destroy_sm;
+               goto out;
        }
 #endif
 
@@ -136,12 +130,8 @@ out_resume:
 out_thaw:
 #ifdef CONFIG_PREEMPT
        thaw_processes();
-
-out_destroy_sm:
-#endif
-       stop_machine_destroy();
-
 out:
+#endif
        shutting_down = SHUTDOWN_INVALID;
 }
 #endif /* CONFIG_PM_SLEEP */
index bd056a5b4efc59ceccb8b121100d40e2bb16cf3f..3817149919cb81fa298686f183f67e0c86fe1c50 100644 (file)
@@ -1140,8 +1140,7 @@ retry:
                 * ep_poll_callback() when events will become available.
                 */
                init_waitqueue_entry(&wait, current);
-               wait.flags |= WQ_FLAG_EXCLUSIVE;
-               __add_wait_queue(&ep->wq, &wait);
+               __add_wait_queue_exclusive(&ep->wq, &wait);
 
                for (;;) {
                        /*
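
The helper used above replaces the open-coded two-step sequence shown in the removed lines. Presumably (include/linux/wait.h is also touched by this merge) it is just that sequence folded into one inline; a sketch of that shape, not the header's literal text:

    static inline void __add_wait_queue_exclusive(wait_queue_head_t *q,
                                                  wait_queue_t *wait)
    {
            wait->flags |= WQ_FLAG_EXCLUSIVE;
            __add_wait_queue(q, wait);
    }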
index a5740fc4d04b9415478f4180dcbdad289f5eb281..a73454aec33312359c4233fa4ec7e0598dbbe345 100644 (file)
@@ -21,8 +21,7 @@ extern int number_of_cpusets; /* How many cpusets are defined in system? */
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
-extern void cpuset_cpus_allowed_locked(struct task_struct *p,
-                                      struct cpumask *mask);
+extern int cpuset_cpus_allowed_fallback(struct task_struct *p);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
 #define cpuset_current_mems_allowed (current->mems_allowed)
 void cpuset_init_current_mems_allowed(void);
@@ -69,9 +68,6 @@ struct seq_file;
 extern void cpuset_task_status_allowed(struct seq_file *m,
                                        struct task_struct *task);
 
-extern void cpuset_lock(void);
-extern void cpuset_unlock(void);
-
 extern int cpuset_mem_spread_node(void);
 
 static inline int cpuset_do_page_mem_spread(void)
@@ -105,10 +101,11 @@ static inline void cpuset_cpus_allowed(struct task_struct *p,
 {
        cpumask_copy(mask, cpu_possible_mask);
 }
-static inline void cpuset_cpus_allowed_locked(struct task_struct *p,
-                                             struct cpumask *mask)
+
+static inline int cpuset_cpus_allowed_fallback(struct task_struct *p)
 {
-       cpumask_copy(mask, cpu_possible_mask);
+       cpumask_copy(&p->cpus_allowed, cpu_possible_mask);
+       return cpumask_any(cpu_active_mask);
 }
 
 static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
@@ -157,9 +154,6 @@ static inline void cpuset_task_status_allowed(struct seq_file *m,
 {
 }
 
-static inline void cpuset_lock(void) {}
-static inline void cpuset_unlock(void) {}
-
 static inline int cpuset_mem_spread_node(void)
 {
        return 0;
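
The !CONFIG_CPUSETS stub above spells out the new contract: cpuset_cpus_allowed_fallback() rewrites p->cpus_allowed and hands back a CPU the caller can use. A sketch of the intended caller pattern (the real user is the scheduler's fallback-CPU path in kernel/sched.c, which this merge also changes); pick_fallback_cpu() is illustrative only:

    #include <linux/cpumask.h>
    #include <linux/cpuset.h>
    #include <linux/sched.h>

    static int pick_fallback_cpu(struct task_struct *p)
    {
            /* Ask cpusets to widen p->cpus_allowed and suggest a CPU. */
            int cpu = cpuset_cpus_allowed_fallback(p);

            /* The caller still has to cope with that CPU going offline. */
            if (!cpu_active(cpu))
                    cpu = cpumask_any(cpu_active_mask);
            return cpu;
    }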
index cc12b3c556b39aef0e4f15d59e9d1b31946d58e2..41e46330d9bedfd16d46a920cbd840dad54afffe 100644 (file)
@@ -82,9 +82,13 @@ void clear_ftrace_function(void);
 extern void ftrace_stub(unsigned long a0, unsigned long a1);
 
 #else /* !CONFIG_FUNCTION_TRACER */
-# define register_ftrace_function(ops) do { } while (0)
-# define unregister_ftrace_function(ops) do { } while (0)
-# define clear_ftrace_function(ops) do { } while (0)
+/*
+ * (un)register_ftrace_function must be a macro since the ops parameter
+ * must not be evaluated.
+ */
+#define register_ftrace_function(ops) ({ 0; })
+#define unregister_ftrace_function(ops) ({ 0; })
+static inline void clear_ftrace_function(void) { }
 static inline void ftrace_kill(void) { }
 static inline void ftrace_stop(void) { }
 static inline void ftrace_start(void) { }
@@ -237,11 +241,13 @@ extern int skip_trace(unsigned long ip);
 extern void ftrace_disable_daemon(void);
 extern void ftrace_enable_daemon(void);
 #else
-# define skip_trace(ip)                                ({ 0; })
-# define ftrace_force_update()                 ({ 0; })
-# define ftrace_set_filter(buf, len, reset)    do { } while (0)
-# define ftrace_disable_daemon()               do { } while (0)
-# define ftrace_enable_daemon()                        do { } while (0)
+static inline int skip_trace(unsigned long ip) { return 0; }
+static inline int ftrace_force_update(void) { return 0; }
+static inline void ftrace_set_filter(unsigned char *buf, int len, int reset)
+{
+}
+static inline void ftrace_disable_daemon(void) { }
+static inline void ftrace_enable_daemon(void) { }
 static inline void ftrace_release_mod(struct module *mod) {}
 static inline int register_ftrace_command(struct ftrace_func_command *cmd)
 {
@@ -314,16 +320,16 @@ static inline void __ftrace_enabled_restore(int enabled)
   extern void time_hardirqs_on(unsigned long a0, unsigned long a1);
   extern void time_hardirqs_off(unsigned long a0, unsigned long a1);
 #else
-# define time_hardirqs_on(a0, a1)              do { } while (0)
-# define time_hardirqs_off(a0, a1)             do { } while (0)
+  static inline void time_hardirqs_on(unsigned long a0, unsigned long a1) { }
+  static inline void time_hardirqs_off(unsigned long a0, unsigned long a1) { }
 #endif
 
 #ifdef CONFIG_PREEMPT_TRACER
   extern void trace_preempt_on(unsigned long a0, unsigned long a1);
   extern void trace_preempt_off(unsigned long a0, unsigned long a1);
 #else
-# define trace_preempt_on(a0, a1)              do { } while (0)
-# define trace_preempt_off(a0, a1)             do { } while (0)
+  static inline void trace_preempt_on(unsigned long a0, unsigned long a1) { }
+  static inline void trace_preempt_off(unsigned long a0, unsigned long a1) { }
 #endif
 
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
@@ -352,6 +358,10 @@ struct ftrace_graph_ret {
        int depth;
 };
 
+/* Type of the callback handlers for tracing function graph*/
+typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *); /* return */
+typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */
+
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 
 /* for init task */
@@ -400,10 +410,6 @@ extern char __irqentry_text_end[];
 
 #define FTRACE_RETFUNC_DEPTH 50
 #define FTRACE_RETSTACK_ALLOC_SIZE 32
-/* Type of the callback handlers for tracing function graph*/
-typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *); /* return */
-typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */
-
 extern int register_ftrace_graph(trace_func_graph_ret_t retfunc,
                                trace_func_graph_ent_t entryfunc);
 
@@ -441,6 +447,13 @@ static inline void unpause_graph_tracing(void)
 static inline void ftrace_graph_init_task(struct task_struct *t) { }
 static inline void ftrace_graph_exit_task(struct task_struct *t) { }
 
+static inline int register_ftrace_graph(trace_func_graph_ret_t retfunc,
+                         trace_func_graph_ent_t entryfunc)
+{
+       return -1;
+}
+static inline void unregister_ftrace_graph(void) { }
+
 static inline int task_curr_ret_stack(struct task_struct *tsk)
 {
        return -1;
@@ -492,7 +505,9 @@ static inline int test_tsk_trace_graph(struct task_struct *tsk)
        return tsk->trace & TSK_TRACE_FL_GRAPH;
 }
 
-extern int ftrace_dump_on_oops;
+enum ftrace_dump_mode;
+
+extern enum ftrace_dump_mode ftrace_dump_on_oops;
 
 #ifdef CONFIG_PREEMPT
 #define INIT_TRACE_RECURSION           .trace_recursion = 0,
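
The comment added above explains why register_ftrace_function()/unregister_ftrace_function() stay macros while the other stubs become static inlines: an inline would evaluate (and therefore require) its ops argument even when CONFIG_FUNCTION_TRACER is off. A sketch of the situation the macro form allows; my_ops/my_setup are hypothetical and the callback signature is assumed from this kernel's ftrace_func_t:

    #include <linux/ftrace.h>

    #ifdef CONFIG_FUNCTION_TRACER
    static void my_trace_func(unsigned long ip, unsigned long parent_ip)
    {
    }
    static struct ftrace_ops my_ops = { .func = my_trace_func };
    #endif

    static int my_setup(void)
    {
            /*
             * With the tracer disabled this expands to ({ 0; }) and &my_ops
             * is never evaluated, so the build succeeds even though my_ops
             * is only defined under CONFIG_FUNCTION_TRACER.
             */
            return register_ftrace_function(&my_ops);
    }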
index c0f4b364c711d172c23919b010963620222a2344..dc7fc646fa2e7a9adfb1d2241f09ee3b11c47b81 100644 (file)
@@ -58,6 +58,7 @@ struct trace_iterator {
        /* The below is zeroed out in pipe_read */
        struct trace_seq        seq;
        struct trace_entry      *ent;
+       unsigned long           lost_events;
        int                     leftover;
        int                     cpu;
        u64                     ts;
@@ -69,18 +70,25 @@ struct trace_iterator {
 };
 
 
+struct trace_event;
+
 typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter,
-                                             int flags);
-struct trace_event {
-       struct hlist_node       node;
-       struct list_head        list;
-       int                     type;
+                                     int flags, struct trace_event *event);
+
+struct trace_event_functions {
        trace_print_func        trace;
        trace_print_func        raw;
        trace_print_func        hex;
        trace_print_func        binary;
 };
 
+struct trace_event {
+       struct hlist_node               node;
+       struct list_head                list;
+       int                             type;
+       struct trace_event_functions    *funcs;
+};
+
 extern int register_ftrace_event(struct trace_event *event);
 extern int unregister_ftrace_event(struct trace_event *event);
 
@@ -112,28 +120,67 @@ void tracing_record_cmdline(struct task_struct *tsk);
 
 struct event_filter;
 
+enum trace_reg {
+       TRACE_REG_REGISTER,
+       TRACE_REG_UNREGISTER,
+       TRACE_REG_PERF_REGISTER,
+       TRACE_REG_PERF_UNREGISTER,
+};
+
+struct ftrace_event_call;
+
+struct ftrace_event_class {
+       char                    *system;
+       void                    *probe;
+#ifdef CONFIG_PERF_EVENTS
+       void                    *perf_probe;
+#endif
+       int                     (*reg)(struct ftrace_event_call *event,
+                                      enum trace_reg type);
+       int                     (*define_fields)(struct ftrace_event_call *);
+       struct list_head        *(*get_fields)(struct ftrace_event_call *);
+       struct list_head        fields;
+       int                     (*raw_init)(struct ftrace_event_call *);
+};
+
+enum {
+       TRACE_EVENT_FL_ENABLED_BIT,
+       TRACE_EVENT_FL_FILTERED_BIT,
+};
+
+enum {
+       TRACE_EVENT_FL_ENABLED  = (1 << TRACE_EVENT_FL_ENABLED_BIT),
+       TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT),
+};
+
 struct ftrace_event_call {
        struct list_head        list;
+       struct ftrace_event_class *class;
        char                    *name;
-       char                    *system;
        struct dentry           *dir;
-       struct trace_event      *event;
-       int                     enabled;
-       int                     (*regfunc)(struct ftrace_event_call *);
-       void                    (*unregfunc)(struct ftrace_event_call *);
-       int                     id;
+       struct trace_event      event;
        const char              *print_fmt;
-       int                     (*raw_init)(struct ftrace_event_call *);
-       int                     (*define_fields)(struct ftrace_event_call *);
-       struct list_head        fields;
-       int                     filter_active;
        struct event_filter     *filter;
        void                    *mod;
        void                    *data;
 
+       /*
+        * 32 bit flags:
+        *   bit 1:             enabled
+        *   bit 2:             filter_active
+        *
+        * Changes to flags must hold the event_mutex.
+        *
+        * Note: Reads of flags do not hold the event_mutex since
+        * they occur in critical sections. But the way flags
+        * is currently used, these changes do not affect the code
+        * except that when a change is made, it may have a slight
+        * delay in propagating the changes to other CPUs due to
+        * caching and such.
+        */
+       unsigned int            flags;
+
        int                     perf_refcount;
-       int                     (*perf_event_enable)(struct ftrace_event_call *);
-       void                    (*perf_event_disable)(struct ftrace_event_call *);
 };
 
 #define PERF_MAX_TRACE_SIZE    2048
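
The restructuring above splits per-event state in two: class-wide callbacks move into struct ftrace_event_class, and the old enabled/filter_active ints collapse into bits of the new flags word. A minimal sketch of working with those bits, assuming only the enums above; the helpers are hypothetical:

    #include <linux/ftrace_event.h>

    static int my_event_enabled(struct ftrace_event_call *call)
    {
            return call->flags & TRACE_EVENT_FL_ENABLED;
    }

    /* Writers must hold event_mutex, per the comment in the structure. */
    static void my_event_set_enabled(struct ftrace_event_call *call, int on)
    {
            if (on)
                    call->flags |= TRACE_EVENT_FL_ENABLED;
            else
                    call->flags &= ~TRACE_EVENT_FL_ENABLED;
    }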
index 9365227dbaf6498f4b4318c7205e8e1a61103da5..9fb1c1299032ca8a051d969b8ab2bde46ea707b1 100644 (file)
@@ -490,6 +490,13 @@ static inline void tracing_off(void) { }
 static inline void tracing_off_permanent(void) { }
 static inline int tracing_is_on(void) { return 0; }
 #endif
+
+enum ftrace_dump_mode {
+       DUMP_NONE,
+       DUMP_ALL,
+       DUMP_ORIG,
+};
+
 #ifdef CONFIG_TRACING
 extern void tracing_start(void);
 extern void tracing_stop(void);
@@ -571,7 +578,7 @@ __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap);
 extern int
 __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap);
 
-extern void ftrace_dump(void);
+extern void ftrace_dump(enum ftrace_dump_mode oops_dump_mode);
 #else
 static inline void
 ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { }
@@ -592,7 +599,7 @@ ftrace_vprintk(const char *fmt, va_list ap)
 {
        return 0;
 }
-static inline void ftrace_dump(void) { }
+static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
 #endif /* CONFIG_TRACING */
 
 /*
index 515d53ae6a795e9b73bbf0388aa8966be1014924..6914fcad46733974df83c76c5d7e2d96008793be 100644 (file)
@@ -465,8 +465,7 @@ static inline void __module_get(struct module *module)
        if (module) {
                preempt_disable();
                __this_cpu_inc(module->refptr->incs);
-               trace_module_get(module, _THIS_IP_,
-                                __this_cpu_read(module->refptr->incs));
+               trace_module_get(module, _THIS_IP_);
                preempt_enable();
        }
 }
@@ -480,8 +479,7 @@ static inline int try_module_get(struct module *module)
 
                if (likely(module_is_live(module))) {
                        __this_cpu_inc(module->refptr->incs);
-                       trace_module_get(module, _THIS_IP_,
-                               __this_cpu_read(module->refptr->incs));
+                       trace_module_get(module, _THIS_IP_);
                } else
                        ret = 0;
 
index a5195875480aa299d96e0649f524a184cdd88413..0006b2df00e1ce14a186c48817db493e39bd58c3 100644 (file)
@@ -60,8 +60,6 @@ static inline long rcu_batches_completed_bh(void)
        return 0;
 }
 
-extern int rcu_expedited_torture_stats(char *page);
-
 static inline void rcu_force_quiescent_state(void)
 {
 }
index 42cc3a04779ee1376adce84aeb57655df656d06f..24e467e526b83d67b7ed4bbb79ac9b0576b88844 100644 (file)
@@ -35,7 +35,6 @@ struct notifier_block;
 extern void rcu_sched_qs(int cpu);
 extern void rcu_bh_qs(int cpu);
 extern int rcu_needs_cpu(int cpu);
-extern int rcu_expedited_torture_stats(char *page);
 
 #ifdef CONFIG_TREE_PREEMPT_RCU
 
index 5fcc31ed5771358b625eb24434e656e28f016cd8..25b4f686d9189242f6e6d817b563101f7ce07262 100644 (file)
@@ -120,12 +120,16 @@ int ring_buffer_write(struct ring_buffer *buffer,
                      unsigned long length, void *data);
 
 struct ring_buffer_event *
-ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts);
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
+                unsigned long *lost_events);
 struct ring_buffer_event *
-ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts);
+ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
+                   unsigned long *lost_events);
 
 struct ring_buffer_iter *
-ring_buffer_read_start(struct ring_buffer *buffer, int cpu);
+ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu);
+void ring_buffer_read_prepare_sync(void);
+void ring_buffer_read_start(struct ring_buffer_iter *iter);
 void ring_buffer_read_finish(struct ring_buffer_iter *iter);
 
 struct ring_buffer_event *
index e0447c64af6ad4bd84cc33e852809183ab449dc5..2a5b146fbaf9e4a2a39a9b112f8a83cb513be6ad 100644 (file)
@@ -274,11 +274,17 @@ extern cpumask_var_t nohz_cpu_mask;
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
 extern int select_nohz_load_balancer(int cpu);
 extern int get_nohz_load_balancer(void);
+extern int nohz_ratelimit(int cpu);
 #else
 static inline int select_nohz_load_balancer(int cpu)
 {
        return 0;
 }
+
+static inline int nohz_ratelimit(int cpu)
+{
+       return 0;
+}
 #endif
 
 /*
@@ -953,6 +959,7 @@ struct sched_domain {
        char *name;
 #endif
 
+       unsigned int span_weight;
        /*
         * Span of all CPUs in this domain.
         *
@@ -1025,12 +1032,17 @@ struct sched_domain;
 #define WF_SYNC                0x01            /* waker goes to sleep after wakup */
 #define WF_FORK                0x02            /* child wakeup after fork */
 
+#define ENQUEUE_WAKEUP         1
+#define ENQUEUE_WAKING         2
+#define ENQUEUE_HEAD           4
+
+#define DEQUEUE_SLEEP          1
+
 struct sched_class {
        const struct sched_class *next;
 
-       void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup,
-                             bool head);
-       void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
+       void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
+       void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
        void (*yield_task) (struct rq *rq);
 
        void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
@@ -1039,7 +1051,8 @@ struct sched_class {
        void (*put_prev_task) (struct rq *rq, struct task_struct *p);
 
 #ifdef CONFIG_SMP
-       int  (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
+       int  (*select_task_rq)(struct rq *rq, struct task_struct *p,
+                              int sd_flag, int flags);
 
        void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
        void (*post_schedule) (struct rq *this_rq);
@@ -1076,36 +1089,8 @@ struct load_weight {
        unsigned long weight, inv_weight;
 };
 
-/*
- * CFS stats for a schedulable entity (task, task-group etc)
- *
- * Current field usage histogram:
- *
- *     4 se->block_start
- *     4 se->run_node
- *     4 se->sleep_start
- *     6 se->load.weight
- */
-struct sched_entity {
-       struct load_weight      load;           /* for load-balancing */
-       struct rb_node          run_node;
-       struct list_head        group_node;
-       unsigned int            on_rq;
-
-       u64                     exec_start;
-       u64                     sum_exec_runtime;
-       u64                     vruntime;
-       u64                     prev_sum_exec_runtime;
-
-       u64                     last_wakeup;
-       u64                     avg_overlap;
-
-       u64                     nr_migrations;
-
-       u64                     start_runtime;
-       u64                     avg_wakeup;
-
 #ifdef CONFIG_SCHEDSTATS
+struct sched_statistics {
        u64                     wait_start;
        u64                     wait_max;
        u64                     wait_count;
@@ -1137,6 +1122,24 @@ struct sched_entity {
        u64                     nr_wakeups_affine_attempts;
        u64                     nr_wakeups_passive;
        u64                     nr_wakeups_idle;
+};
+#endif
+
+struct sched_entity {
+       struct load_weight      load;           /* for load-balancing */
+       struct rb_node          run_node;
+       struct list_head        group_node;
+       unsigned int            on_rq;
+
+       u64                     exec_start;
+       u64                     sum_exec_runtime;
+       u64                     vruntime;
+       u64                     prev_sum_exec_runtime;
+
+       u64                     nr_migrations;
+
+#ifdef CONFIG_SCHEDSTATS
+       struct sched_statistics statistics;
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1840,6 +1843,7 @@ extern void sched_clock_idle_sleep_event(void);
 extern void sched_clock_idle_wakeup_event(u64 delta_ns);
 
 #ifdef CONFIG_HOTPLUG_CPU
+extern void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p);
 extern void idle_task_exit(void);
 #else
 static inline void idle_task_exit(void) {}
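
enqueue_task()/dequeue_task() lose their separate wakeup/head/sleep parameters in favour of the ENQUEUE_*/DEQUEUE_* bits defined above. A sketch of how a scheduling-class hook might decode them; my_enqueue_task() is illustrative and not any in-tree class:

    #include <linux/kernel.h>
    #include <linux/sched.h>

    struct rq;      /* the real struct is private to kernel/sched.c */

    static void my_enqueue_task(struct rq *rq, struct task_struct *p, int flags)
    {
            /* The old 'int wakeup, bool head' parameters are now flag bits. */
            if (flags & ENQUEUE_WAKEUP)
                    pr_debug("enqueue pid %d on wakeup\n", p->pid);
            if (flags & ENQUEUE_HEAD)
                    pr_debug("enqueue pid %d at queue head\n", p->pid);
    }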
index baba3a23a8145c6106177f924dd712b63e7b2f17..6b524a0d02e42b14419c5ae75133360a1f4874a5 100644 (file)
 #ifndef _LINUX_STOP_MACHINE
 #define _LINUX_STOP_MACHINE
-/* "Bogolock": stop the entire machine, disable interrupts.  This is a
-   very heavy lock, which is equivalent to grabbing every spinlock
-   (and more).  So the "read" side to such a lock is anything which
-   disables preeempt. */
+
 #include <linux/cpu.h>
 #include <linux/cpumask.h>
+#include <linux/list.h>
 #include <asm/system.h>
 
+/*
+ * stop_cpu[s]() is a simplistic per-cpu maximum priority cpu
+ * monopolization mechanism.  The caller can specify a non-sleeping
+ * function to be executed on a single or multiple cpus preempting all
+ * other processes and monopolizing those cpus until it finishes.
+ *
+ * Resources for this mechanism are preallocated when a cpu is brought
+ * up and requests are guaranteed to be served as long as the target
+ * cpus are online.
+ */
+typedef int (*cpu_stop_fn_t)(void *arg);
+
+#ifdef CONFIG_SMP
+
+struct cpu_stop_work {
+       struct list_head        list;           /* cpu_stopper->works */
+       cpu_stop_fn_t           fn;
+       void                    *arg;
+       struct cpu_stop_done    *done;
+};
+
+int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg);
+void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
+                        struct cpu_stop_work *work_buf);
+int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
+int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
+
+#else  /* CONFIG_SMP */
+
+#include <linux/workqueue.h>
+
+struct cpu_stop_work {
+       struct work_struct      work;
+       cpu_stop_fn_t           fn;
+       void                    *arg;
+};
+
+static inline int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
+{
+       int ret = -ENOENT;
+       preempt_disable();
+       if (cpu == smp_processor_id())
+               ret = fn(arg);
+       preempt_enable();
+       return ret;
+}
+
+static void stop_one_cpu_nowait_workfn(struct work_struct *work)
+{
+       struct cpu_stop_work *stwork =
+               container_of(work, struct cpu_stop_work, work);
+       preempt_disable();
+       stwork->fn(stwork->arg);
+       preempt_enable();
+}
+
+static inline void stop_one_cpu_nowait(unsigned int cpu,
+                                      cpu_stop_fn_t fn, void *arg,
+                                      struct cpu_stop_work *work_buf)
+{
+       if (cpu == smp_processor_id()) {
+               INIT_WORK(&work_buf->work, stop_one_cpu_nowait_workfn);
+               work_buf->fn = fn;
+               work_buf->arg = arg;
+               schedule_work(&work_buf->work);
+       }
+}
+
+static inline int stop_cpus(const struct cpumask *cpumask,
+                           cpu_stop_fn_t fn, void *arg)
+{
+       if (cpumask_test_cpu(raw_smp_processor_id(), cpumask))
+               return stop_one_cpu(raw_smp_processor_id(), fn, arg);
+       return -ENOENT;
+}
+
+static inline int try_stop_cpus(const struct cpumask *cpumask,
+                               cpu_stop_fn_t fn, void *arg)
+{
+       return stop_cpus(cpumask, fn, arg);
+}
+
+#endif /* CONFIG_SMP */
+
+/*
+ * stop_machine "Bogolock": stop the entire machine, disable
+ * interrupts.  This is a very heavy lock, which is equivalent to
+ * grabbing every spinlock (and more).  So the "read" side to such a
+ * lock is anything which disables preemption.
+ */
 #if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP)
 
 /**
@@ -36,24 +124,7 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus);
  */
 int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus);
 
-/**
- * stop_machine_create: create all stop_machine threads
- *
- * Description: This causes all stop_machine threads to be created before
- * stop_machine actually gets called. This can be used by subsystems that
- * need a non failing stop_machine infrastructure.
- */
-int stop_machine_create(void);
-
-/**
- * stop_machine_destroy: destroy all stop_machine threads
- *
- * Description: This causes all stop_machine threads which were created with
- * stop_machine_create to be destroyed again.
- */
-void stop_machine_destroy(void);
-
-#else
+#else   /* CONFIG_STOP_MACHINE && CONFIG_SMP */
 
 static inline int stop_machine(int (*fn)(void *), void *data,
                               const struct cpumask *cpus)
@@ -65,8 +136,5 @@ static inline int stop_machine(int (*fn)(void *), void *data,
        return ret;
 }
 
-static inline int stop_machine_create(void) { return 0; }
-static inline void stop_machine_destroy(void) { }
-
-#endif /* CONFIG_SMP */
-#endif /* _LINUX_STOP_MACHINE */
+#endif /* CONFIG_STOP_MACHINE && CONFIG_SMP */
+#endif /* _LINUX_STOP_MACHINE */
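
The new cpu_stop interface declared above runs a short, non-sleeping callback on a chosen CPU with that CPU monopolized, with no stop_machine_create()/destroy() setup step (hence the removals in the s390 and xen hunks earlier). A sketch of a caller; my_probe_fn()/my_probe_cpu() are purely hypothetical:

    #include <linux/smp.h>
    #include <linux/stop_machine.h>

    /* Runs on the target CPU at maximum priority; must not sleep. */
    static int my_probe_fn(void *arg)
    {
            unsigned int *where = arg;

            *where = smp_processor_id();
            return 0;
    }

    static int my_probe_cpu(unsigned int cpu)
    {
            unsigned int where = 0;
            int ret;

            /* Blocks until my_probe_fn() has completed on 'cpu'. */
            ret = stop_one_cpu(cpu, my_probe_fn, &where);
            return ret ? ret : (int)where;
    }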
index 057929b0a6514963b66098e9f3ff58129937ffa5..a1a86a53bc735c13cdda531309aa6c8158231909 100644 (file)
@@ -103,22 +103,6 @@ struct perf_event_attr;
 #define __SC_TEST5(t5, a5, ...)        __SC_TEST(t5); __SC_TEST4(__VA_ARGS__)
 #define __SC_TEST6(t6, a6, ...)        __SC_TEST(t6); __SC_TEST5(__VA_ARGS__)
 
-#ifdef CONFIG_PERF_EVENTS
-
-#define TRACE_SYS_ENTER_PERF_INIT(sname)                                      \
-       .perf_event_enable = perf_sysenter_enable,                             \
-       .perf_event_disable = perf_sysenter_disable,
-
-#define TRACE_SYS_EXIT_PERF_INIT(sname)                                               \
-       .perf_event_enable = perf_sysexit_enable,                              \
-       .perf_event_disable = perf_sysexit_disable,
-#else
-#define TRACE_SYS_ENTER_PERF(sname)
-#define TRACE_SYS_ENTER_PERF_INIT(sname)
-#define TRACE_SYS_EXIT_PERF(sname)
-#define TRACE_SYS_EXIT_PERF_INIT(sname)
-#endif /* CONFIG_PERF_EVENTS */
-
 #ifdef CONFIG_FTRACE_SYSCALLS
 #define __SC_STR_ADECL1(t, a)          #a
 #define __SC_STR_ADECL2(t, a, ...)     #a, __SC_STR_ADECL1(__VA_ARGS__)
@@ -134,54 +118,43 @@ struct perf_event_attr;
 #define __SC_STR_TDECL5(t, a, ...)     #t, __SC_STR_TDECL4(__VA_ARGS__)
 #define __SC_STR_TDECL6(t, a, ...)     #t, __SC_STR_TDECL5(__VA_ARGS__)
 
+extern struct ftrace_event_class event_class_syscall_enter;
+extern struct ftrace_event_class event_class_syscall_exit;
+extern struct trace_event_functions enter_syscall_print_funcs;
+extern struct trace_event_functions exit_syscall_print_funcs;
+
 #define SYSCALL_TRACE_ENTER_EVENT(sname)                               \
-       static const struct syscall_metadata __syscall_meta_##sname;    \
+       static struct syscall_metadata __syscall_meta_##sname;          \
        static struct ftrace_event_call                                 \
        __attribute__((__aligned__(4))) event_enter_##sname;            \
-       static struct trace_event enter_syscall_print_##sname = {       \
-               .trace                  = print_syscall_enter,          \
-       };                                                              \
        static struct ftrace_event_call __used                          \
          __attribute__((__aligned__(4)))                               \
          __attribute__((section("_ftrace_events")))                    \
          event_enter_##sname = {                                       \
                .name                   = "sys_enter"#sname,            \
-               .system                 = "syscalls",                   \
-               .event                  = &enter_syscall_print_##sname, \
-               .raw_init               = init_syscall_trace,           \
-               .define_fields          = syscall_enter_define_fields,  \
-               .regfunc                = reg_event_syscall_enter,      \
-               .unregfunc              = unreg_event_syscall_enter,    \
+               .class                  = &event_class_syscall_enter,   \
+               .event.funcs            = &enter_syscall_print_funcs,   \
                .data                   = (void *)&__syscall_meta_##sname,\
-               TRACE_SYS_ENTER_PERF_INIT(sname)                        \
        }
 
 #define SYSCALL_TRACE_EXIT_EVENT(sname)                                        \
-       static const struct syscall_metadata __syscall_meta_##sname;    \
+       static struct syscall_metadata __syscall_meta_##sname;          \
        static struct ftrace_event_call                                 \
        __attribute__((__aligned__(4))) event_exit_##sname;             \
-       static struct trace_event exit_syscall_print_##sname = {        \
-               .trace                  = print_syscall_exit,           \
-       };                                                              \
        static struct ftrace_event_call __used                          \
          __attribute__((__aligned__(4)))                               \
          __attribute__((section("_ftrace_events")))                    \
          event_exit_##sname = {                                        \
                .name                   = "sys_exit"#sname,             \
-               .system                 = "syscalls",                   \
-               .event                  = &exit_syscall_print_##sname,  \
-               .raw_init               = init_syscall_trace,           \
-               .define_fields          = syscall_exit_define_fields,   \
-               .regfunc                = reg_event_syscall_exit,       \
-               .unregfunc              = unreg_event_syscall_exit,     \
+               .class                  = &event_class_syscall_exit,    \
+               .event.funcs            = &exit_syscall_print_funcs,    \
                .data                   = (void *)&__syscall_meta_##sname,\
-               TRACE_SYS_EXIT_PERF_INIT(sname)                 \
        }
 
 #define SYSCALL_METADATA(sname, nb)                            \
        SYSCALL_TRACE_ENTER_EVENT(sname);                       \
        SYSCALL_TRACE_EXIT_EVENT(sname);                        \
-       static const struct syscall_metadata __used             \
+       static struct syscall_metadata __used                   \
          __attribute__((__aligned__(4)))                       \
          __attribute__((section("__syscalls_metadata")))       \
          __syscall_meta_##sname = {                            \
@@ -191,12 +164,14 @@ struct perf_event_attr;
                .args           = args_##sname,                 \
                .enter_event    = &event_enter_##sname,         \
                .exit_event     = &event_exit_##sname,          \
+               .enter_fields   = LIST_HEAD_INIT(__syscall_meta_##sname.enter_fields), \
+               .exit_fields    = LIST_HEAD_INIT(__syscall_meta_##sname.exit_fields), \
        };
 
 #define SYSCALL_DEFINE0(sname)                                 \
        SYSCALL_TRACE_ENTER_EVENT(_##sname);                    \
        SYSCALL_TRACE_EXIT_EVENT(_##sname);                     \
-       static const struct syscall_metadata __used             \
+       static struct syscall_metadata __used                   \
          __attribute__((__aligned__(4)))                       \
          __attribute__((section("__syscalls_metadata")))       \
          __syscall_meta__##sname = {                           \
@@ -204,6 +179,8 @@ struct perf_event_attr;
                .nb_args        = 0,                            \
                .enter_event    = &event_enter__##sname,        \
                .exit_event     = &event_exit__##sname,         \
+               .enter_fields   = LIST_HEAD_INIT(__syscall_meta__##sname.enter_fields), \
+               .exit_fields    = LIST_HEAD_INIT(__syscall_meta__##sname.exit_fields), \
        };                                                      \
        asmlinkage long sys_##sname(void)
 #else
index d2ae79e21be3a4aee4fe1cd8dde51d272d7b97dc..b232ccc0ee291d8297d1fd40b07ab999979bc3cb 100644 (file)
@@ -42,6 +42,7 @@ enum tick_nohz_mode {
  * @idle_waketime:     Time when the idle was interrupted
  * @idle_exittime:     Time when the idle state was left
  * @idle_sleeptime:    Sum of the time slept in idle with sched tick stopped
+ * @iowait_sleeptime:  Sum of the time slept in idle with sched tick stopped, with IO outstanding
  * @sleep_length:      Duration of the current idle sleep
  * @do_timer_lst:      CPU was the last one doing do_timer before going idle
  */
@@ -60,7 +61,7 @@ struct tick_sched {
        ktime_t                         idle_waketime;
        ktime_t                         idle_exittime;
        ktime_t                         idle_sleeptime;
-       ktime_t                         idle_lastupdate;
+       ktime_t                         iowait_sleeptime;
        ktime_t                         sleep_length;
        unsigned long                   last_jiffies;
        unsigned long                   next_jiffies;
@@ -124,6 +125,7 @@ extern void tick_nohz_stop_sched_tick(int inidle);
 extern void tick_nohz_restart_sched_tick(void);
 extern ktime_t tick_nohz_get_sleep_length(void);
 extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
+extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
 # else
 static inline void tick_nohz_stop_sched_tick(int inidle) { }
 static inline void tick_nohz_restart_sched_tick(void) { }
@@ -134,6 +136,7 @@ static inline ktime_t tick_nohz_get_sleep_length(void)
        return len;
 }
 static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; }
+static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
 # endif /* !NO_HZ */
 
 #endif
index 78b4bd3be496c22f96fb6bc6400d2e831b6ce210..9a59d1f98cd4114948129d8e9ad95b77da27ec6d 100644 (file)
 struct module;
 struct tracepoint;
 
+struct tracepoint_func {
+       void *func;
+       void *data;
+};
+
 struct tracepoint {
        const char *name;               /* Tracepoint name */
        int state;                      /* State. */
        void (*regfunc)(void);
        void (*unregfunc)(void);
-       void **funcs;
+       struct tracepoint_func *funcs;
 } __attribute__((aligned(32)));                /*
                                         * Aligned on 32 bytes because it is
                                         * globally visible and gcc happily
@@ -33,6 +38,68 @@ struct tracepoint {
                                         * Keep in sync with vmlinux.lds.h.
                                         */
 
+/*
+ * Connect a probe to a tracepoint.
+ * Internal API, should not be used directly.
+ */
+extern int tracepoint_probe_register(const char *name, void *probe, void *data);
+
+/*
+ * Disconnect a probe from a tracepoint.
+ * Internal API, should not be used directly.
+ */
+extern int
+tracepoint_probe_unregister(const char *name, void *probe, void *data);
+
+extern int tracepoint_probe_register_noupdate(const char *name, void *probe,
+                                             void *data);
+extern int tracepoint_probe_unregister_noupdate(const char *name, void *probe,
+                                               void *data);
+extern void tracepoint_probe_update_all(void);
+
+struct tracepoint_iter {
+       struct module *module;
+       struct tracepoint *tracepoint;
+};
+
+extern void tracepoint_iter_start(struct tracepoint_iter *iter);
+extern void tracepoint_iter_next(struct tracepoint_iter *iter);
+extern void tracepoint_iter_stop(struct tracepoint_iter *iter);
+extern void tracepoint_iter_reset(struct tracepoint_iter *iter);
+extern int tracepoint_get_iter_range(struct tracepoint **tracepoint,
+       struct tracepoint *begin, struct tracepoint *end);
+
+/*
+ * tracepoint_synchronize_unregister must be called between the last tracepoint
+ * probe unregistration and the end of module exit to make sure there is no
+ * caller executing a probe when it is freed.
+ */
+static inline void tracepoint_synchronize_unregister(void)
+{
+       synchronize_sched();
+}
+
+#define PARAMS(args...) args
+
+#ifdef CONFIG_TRACEPOINTS
+extern void tracepoint_update_probe_range(struct tracepoint *begin,
+       struct tracepoint *end);
+#else
+static inline void tracepoint_update_probe_range(struct tracepoint *begin,
+       struct tracepoint *end)
+{ }
+#endif /* CONFIG_TRACEPOINTS */
+
+#endif /* _LINUX_TRACEPOINT_H */
+
+/*
+ * Note: we keep the TRACE_EVENT and DECLARE_TRACE outside the include
+ *  file ifdef protection.
+ *  This is due to the way trace events work. If a file includes two
+ *  trace event headers under one "CREATE_TRACE_POINTS" the first include
+ *  will override the TRACE_EVENT and break the second include.
+ */
+
 #ifndef DECLARE_TRACE
 
 #define TP_PROTO(args...)      args
@@ -43,17 +110,27 @@ struct tracepoint {
 /*
  * it_func[0] is never NULL because there is at least one element in the array
  * when the array itself is non NULL.
+ *
+ * Note, the proto and args passed in include "__data" as the first parameter.
+ * The reason for this is to handle the "void" prototype. If a tracepoint
+ * has a "void" prototype, then it is invalid to declare a function
+ * as "(void *, void)". The DECLARE_TRACE_NOARGS() will pass in just
+ * "void *data", whereas the DECLARE_TRACE() will pass in "void *data, proto".
  */
 #define __DO_TRACE(tp, proto, args)                                    \
        do {                                                            \
-               void **it_func;                                         \
+               struct tracepoint_func *it_func_ptr;                    \
+               void *it_func;                                          \
+               void *__data;                                           \
                                                                        \
                rcu_read_lock_sched_notrace();                          \
-               it_func = rcu_dereference_sched((tp)->funcs);           \
-               if (it_func) {                                          \
+               it_func_ptr = rcu_dereference_sched((tp)->funcs);       \
+               if (it_func_ptr) {                                      \
                        do {                                            \
-                               ((void(*)(proto))(*it_func))(args);     \
-                       } while (*(++it_func));                         \
+                               it_func = (it_func_ptr)->func;          \
+                               __data = (it_func_ptr)->data;           \
+                               ((void(*)(proto))(it_func))(args);      \
+                       } while ((++it_func_ptr)->func);                \
                }                                                       \
                rcu_read_unlock_sched_notrace();                        \
        } while (0)
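
The loop above walks a NULL-terminated array of tracepoint_func entries, handing each probe its registration-time data pointer as argument zero followed by the tracepoint arguments. Spelled out for a hypothetical tracepoint carrying a single int, the expansion behaves roughly like the sketch below; it illustrates the shape, it is not the generated code:

#include <linux/tracepoint.h>

static void example_do_trace(struct tracepoint *tp, int value)
{
        struct tracepoint_func *it_func_ptr;

        rcu_read_lock_sched_notrace();
        it_func_ptr = rcu_dereference_sched(tp->funcs);
        if (it_func_ptr) {
                do {
                        void (*fn)(void *data, int value) =
                                (void (*)(void *, int))it_func_ptr->func;

                        /* each probe receives its own data pointer first */
                        fn(it_func_ptr->data, value);
                } while ((++it_func_ptr)->func);        /* NULL func ends the array */
        }
        rcu_read_unlock_sched_notrace();
}
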
@@ -63,24 +140,32 @@ struct tracepoint {
  * not add unwanted padding between the beginning of the section and the
  * structure. Force alignment to the same alignment as the section start.
  */
-#define DECLARE_TRACE(name, proto, args)                               \
+#define __DECLARE_TRACE(name, proto, args, data_proto, data_args)      \
        extern struct tracepoint __tracepoint_##name;                   \
        static inline void trace_##name(proto)                          \
        {                                                               \
                if (unlikely(__tracepoint_##name.state))                \
                        __DO_TRACE(&__tracepoint_##name,                \
-                               TP_PROTO(proto), TP_ARGS(args));        \
+                               TP_PROTO(data_proto),                   \
+                               TP_ARGS(data_args));                    \
        }                                                               \
-       static inline int register_trace_##name(void (*probe)(proto))   \
+       static inline int                                               \
+       register_trace_##name(void (*probe)(data_proto), void *data)    \
        {                                                               \
-               return tracepoint_probe_register(#name, (void *)probe); \
+               return tracepoint_probe_register(#name, (void *)probe,  \
+                                                data);                 \
        }                                                               \
-       static inline int unregister_trace_##name(void (*probe)(proto)) \
+       static inline int                                               \
+       unregister_trace_##name(void (*probe)(data_proto), void *data)  \
+       {                                                               \
+               return tracepoint_probe_unregister(#name, (void *)probe, \
+                                                  data);               \
+       }                                                               \
+       static inline void                                              \
+       check_trace_callback_type_##name(void (*cb)(data_proto))        \
        {                                                               \
-               return tracepoint_probe_unregister(#name, (void *)probe);\
        }
 
-
 #define DEFINE_TRACE_FN(name, reg, unreg)                              \
        static const char __tpstrtab_##name[]                           \
        __attribute__((section("__tracepoints_strings"))) = #name;      \
@@ -96,22 +181,24 @@ struct tracepoint {
 #define EXPORT_TRACEPOINT_SYMBOL(name)                                 \
        EXPORT_SYMBOL(__tracepoint_##name)
 
-extern void tracepoint_update_probe_range(struct tracepoint *begin,
-       struct tracepoint *end);
-
 #else /* !CONFIG_TRACEPOINTS */
-#define DECLARE_TRACE(name, proto, args)                               \
-       static inline void _do_trace_##name(struct tracepoint *tp, proto) \
-       { }                                                             \
+#define __DECLARE_TRACE(name, proto, args, data_proto, data_args)      \
        static inline void trace_##name(proto)                          \
        { }                                                             \
-       static inline int register_trace_##name(void (*probe)(proto))   \
+       static inline int                                               \
+       register_trace_##name(void (*probe)(data_proto),                \
+                             void *data)                               \
        {                                                               \
                return -ENOSYS;                                         \
        }                                                               \
-       static inline int unregister_trace_##name(void (*probe)(proto)) \
+       static inline int                                               \
+       unregister_trace_##name(void (*probe)(data_proto),              \
+                               void *data)                             \
        {                                                               \
                return -ENOSYS;                                         \
+       }                                                               \
+       static inline void check_trace_callback_type_##name(void (*cb)(data_proto)) \
+       {                                                               \
        }
 
 #define DEFINE_TRACE_FN(name, reg, unreg)
@@ -119,60 +206,31 @@ extern void tracepoint_update_probe_range(struct tracepoint *begin,
 #define EXPORT_TRACEPOINT_SYMBOL_GPL(name)
 #define EXPORT_TRACEPOINT_SYMBOL(name)
 
-static inline void tracepoint_update_probe_range(struct tracepoint *begin,
-       struct tracepoint *end)
-{ }
 #endif /* CONFIG_TRACEPOINTS */
-#endif /* DECLARE_TRACE */
-
-/*
- * Connect a probe to a tracepoint.
- * Internal API, should not be used directly.
- */
-extern int tracepoint_probe_register(const char *name, void *probe);
-
-/*
- * Disconnect a probe from a tracepoint.
- * Internal API, should not be used directly.
- */
-extern int tracepoint_probe_unregister(const char *name, void *probe);
-
-extern int tracepoint_probe_register_noupdate(const char *name, void *probe);
-extern int tracepoint_probe_unregister_noupdate(const char *name, void *probe);
-extern void tracepoint_probe_update_all(void);
-
-struct tracepoint_iter {
-       struct module *module;
-       struct tracepoint *tracepoint;
-};
-
-extern void tracepoint_iter_start(struct tracepoint_iter *iter);
-extern void tracepoint_iter_next(struct tracepoint_iter *iter);
-extern void tracepoint_iter_stop(struct tracepoint_iter *iter);
-extern void tracepoint_iter_reset(struct tracepoint_iter *iter);
-extern int tracepoint_get_iter_range(struct tracepoint **tracepoint,
-       struct tracepoint *begin, struct tracepoint *end);
 
 /*
- * tracepoint_synchronize_unregister must be called between the last tracepoint
- * probe unregistration and the end of module exit to make sure there is no
- * caller executing a probe when it is freed.
+ * The need for the DECLARE_TRACE_NOARGS() is to handle the prototype
+ * (void). "void" is a special value in a function prototype and can
+ * not be combined with other arguments. Since the DECLARE_TRACE()
+ * macro adds a data element at the beginning of the prototype,
+ * we need a way to differentiate "(void *data, proto)" from
+ * "(void *data, void)". The second prototype is invalid.
+ *
+ * DECLARE_TRACE_NOARGS() passes "void" as the tracepoint prototype
+ * and "void *__data" as the callback prototype.
+ *
+ * DECLARE_TRACE() passes "proto" as the tracepoint prototype and
+ * "void *__data, proto" as the callback prototype.
  */
-static inline void tracepoint_synchronize_unregister(void)
-{
-       synchronize_sched();
-}
+#define DECLARE_TRACE_NOARGS(name)                                     \
+               __DECLARE_TRACE(name, void, , void *__data, __data)
 
-#define PARAMS(args...) args
-
-#endif /* _LINUX_TRACEPOINT_H */
+#define DECLARE_TRACE(name, proto, args)                               \
+               __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args),      \
+                               PARAMS(void *__data, proto),            \
+                               PARAMS(__data, args))
 
-/*
- * Note: we keep the TRACE_EVENT outside the include file ifdef protection.
- *  This is due to the way trace events work. If a file includes two
- *  trace event headers under one "CREATE_TRACE_POINTS" the first include
- *  will override the TRACE_EVENT and break the second include.
- */
+#endif /* DECLARE_TRACE */
 
 #ifndef TRACE_EVENT
 /*
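
Putting the two macros together: a hypothetical DECLARE_TRACE(foo, ...) produces trace_foo(int bar) plus register/unregister helpers whose probes take the data pointer first, while DECLARE_TRACE_NOARGS(baz) produces trace_baz(void) with plain void (*)(void *__data) probes. A sketch of both sides, all names hypothetical (a matching DEFINE_TRACE() in some .c file is still required, as before):

DECLARE_TRACE(foo,
        TP_PROTO(int bar),
        TP_ARGS(bar));

DECLARE_TRACE_NOARGS(baz);

static void foo_probe(void *__data, int bar)
{
        /* __data is whatever was handed to register_trace_foo() */
}

static void baz_probe(void *__data)
{
}

static int attach_probes(void)
{
        int ret;

        ret = register_trace_foo(foo_probe, NULL);
        if (ret)
                return ret;
        return register_trace_baz(baz_probe, NULL);
}
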
index a48e16b77d5e2d94c1bb1bfb6b5efa56a82bd7af..76d96d035ea03920ac919a13e2b255e93cc77f35 100644 (file)
@@ -127,12 +127,26 @@ static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new)
 /*
  * Used for wake-one threads:
  */
+static inline void __add_wait_queue_exclusive(wait_queue_head_t *q,
+                                             wait_queue_t *wait)
+{
+       wait->flags |= WQ_FLAG_EXCLUSIVE;
+       __add_wait_queue(q, wait);
+}
+
 static inline void __add_wait_queue_tail(wait_queue_head_t *head,
-                                               wait_queue_t *new)
+                                        wait_queue_t *new)
 {
        list_add_tail(&new->task_list, &head->task_list);
 }
 
+static inline void __add_wait_queue_tail_exclusive(wait_queue_head_t *q,
+                                             wait_queue_t *wait)
+{
+       wait->flags |= WQ_FLAG_EXCLUSIVE;
+       __add_wait_queue_tail(q, wait);
+}
+
 static inline void __remove_wait_queue(wait_queue_head_t *head,
                                                        wait_queue_t *old)
 {
@@ -403,25 +417,6 @@ do {                                                                       \
        __ret;                                                          \
 })
 
-/*
- * Must be called with the spinlock in the wait_queue_head_t held.
- */
-static inline void add_wait_queue_exclusive_locked(wait_queue_head_t *q,
-                                                  wait_queue_t * wait)
-{
-       wait->flags |= WQ_FLAG_EXCLUSIVE;
-       __add_wait_queue_tail(q,  wait);
-}
-
-/*
- * Must be called with the spinlock in the wait_queue_head_t held.
- */
-static inline void remove_wait_queue_locked(wait_queue_head_t *q,
-                                           wait_queue_t * wait)
-{
-       __remove_wait_queue(q,  wait);
-}
-
 /*
  * These are the old interfaces to sleep waiting for an event.
  * They are racy.  DO NOT use them, use the wait_event* interfaces above.
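
The two new helpers replace the removed *_locked variants: they still assume the caller holds the wait-queue head's spinlock and merely set WQ_FLAG_EXCLUSIVE before linking at the head or tail. A minimal sketch with illustrative names; fs/eventpoll.c in this series is one of the converted callers:

#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(example_waitq);

static void queue_exclusive_waiter(wait_queue_t *wait)
{
        unsigned long flags;

        /* caller-provided locking, exactly as with the old *_locked helpers */
        spin_lock_irqsave(&example_waitq.lock, flags);
        __add_wait_queue_tail_exclusive(&example_waitq, wait);
        spin_unlock_irqrestore(&example_waitq.lock, flags);
}
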
index 5acfb1eb4df91cd096da3ee728bd636955247b0f..1dfab54015113b83bce9f3302470c3a5ed95b5e7 100644 (file)
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
+/* Make all open-coded DECLARE_TRACE nops */
+#undef DECLARE_TRACE
+#define DECLARE_TRACE(name, proto, args)
+
 #ifdef CONFIG_EVENT_TRACING
 #include <trace/ftrace.h>
 #endif
@@ -75,6 +79,7 @@
 #undef DEFINE_EVENT
 #undef DEFINE_EVENT_PRINT
 #undef TRACE_HEADER_MULTI_READ
+#undef DECLARE_TRACE
 
 /* Only undef what we defined in this file */
 #ifdef UNDEF_TRACE_INCLUDE_FILE
index 4b0f48ba16a688da9ead5b901604419c7823ea3b..c7bb2f0482fec377b1f67edd661d6331441fe7c3 100644 (file)
@@ -51,11 +51,14 @@ TRACE_EVENT(module_free,
        TP_printk("%s", __get_str(name))
 );
 
+#ifdef CONFIG_MODULE_UNLOAD
+/* trace_module_get/put are only used if CONFIG_MODULE_UNLOAD is defined */
+
 DECLARE_EVENT_CLASS(module_refcnt,
 
-       TP_PROTO(struct module *mod, unsigned long ip, int refcnt),
+       TP_PROTO(struct module *mod, unsigned long ip),
 
-       TP_ARGS(mod, ip, refcnt),
+       TP_ARGS(mod, ip),
 
        TP_STRUCT__entry(
                __field(        unsigned long,  ip              )
@@ -65,7 +68,7 @@ DECLARE_EVENT_CLASS(module_refcnt,
 
        TP_fast_assign(
                __entry->ip     = ip;
-               __entry->refcnt = refcnt;
+               __entry->refcnt = __this_cpu_read(mod->refptr->incs) + __this_cpu_read(mod->refptr->decs);
                __assign_str(name, mod->name);
        ),
 
@@ -75,17 +78,18 @@ DECLARE_EVENT_CLASS(module_refcnt,
 
 DEFINE_EVENT(module_refcnt, module_get,
 
-       TP_PROTO(struct module *mod, unsigned long ip, int refcnt),
+       TP_PROTO(struct module *mod, unsigned long ip),
 
-       TP_ARGS(mod, ip, refcnt)
+       TP_ARGS(mod, ip)
 );
 
 DEFINE_EVENT(module_refcnt, module_put,
 
-       TP_PROTO(struct module *mod, unsigned long ip, int refcnt),
+       TP_PROTO(struct module *mod, unsigned long ip),
 
-       TP_ARGS(mod, ip, refcnt)
+       TP_ARGS(mod, ip)
 );
+#endif /* CONFIG_MODULE_UNLOAD */
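
With the refcnt argument gone, the event class computes the displayed count itself from mod->refptr, which only exists under CONFIG_MODULE_UNLOAD, hence the new guard. A sketch of the resulting call shape, loosely mirroring what __module_get() in kernel/module.c looks like after the hunks later in this diff:

static void example_module_ref(struct module *mod)
{
        preempt_disable();
        __this_cpu_inc(mod->refptr->incs);
        trace_module_get(mod, _RET_IP_);        /* no refcount snapshot passed */
        preempt_enable();
}
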
 
 TRACE_EVENT(module_request,
 
index a8989c4547e7e7b790140957dcaaa3c7917952cd..188deca2f3c7721a1baac60cc07e8d7006442c71 100644 (file)
@@ -1,4 +1,7 @@
-#ifndef _TRACE_NAPI_H_
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM napi
+
+#if !defined(_TRACE_NAPI_H_) || defined(TRACE_HEADER_MULTI_READ)
 #define _TRACE_NAPI_H_
 
 #include <linux/netdevice.h>
@@ -8,4 +11,7 @@ DECLARE_TRACE(napi_poll,
        TP_PROTO(struct napi_struct *napi),
        TP_ARGS(napi));
 
-#endif
+#endif /* _TRACE_NAPI_H_ */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
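
Routing napi.h through define_trace.h means exactly one compilation unit instantiates the tracepoint, by defining CREATE_TRACE_POINTS before the include, and every other user simply includes the header and calls trace_napi_poll(). A sketch of the two sides; the file split is illustrative, with the net core's trace instantiation file playing the first role in-tree:

/* in the single file that creates the tracepoints */
#define CREATE_TRACE_POINTS
#include <trace/events/napi.h>

/* in any other user */
#include <trace/events/napi.h>

static void poll_one(struct napi_struct *napi)
{
        trace_napi_poll(napi);
}
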
index cfceb0b73e205bb936a6e3fcfeb34f5515ab0feb..4f733ecea46e8cd464d05c92d5610f57c10a59fc 100644 (file)
@@ -51,15 +51,12 @@ TRACE_EVENT(sched_kthread_stop_ret,
 
 /*
  * Tracepoint for waiting on task to unschedule:
- *
- * (NOTE: the 'rq' argument is not used by generic trace events,
- *        but used by the latency tracer plugin. )
  */
 TRACE_EVENT(sched_wait_task,
 
-       TP_PROTO(struct rq *rq, struct task_struct *p),
+       TP_PROTO(struct task_struct *p),
 
-       TP_ARGS(rq, p),
+       TP_ARGS(p),
 
        TP_STRUCT__entry(
                __array(        char,   comm,   TASK_COMM_LEN   )
@@ -79,15 +76,12 @@ TRACE_EVENT(sched_wait_task,
 
 /*
  * Tracepoint for waking up a task:
- *
- * (NOTE: the 'rq' argument is not used by generic trace events,
- *        but used by the latency tracer plugin. )
  */
 DECLARE_EVENT_CLASS(sched_wakeup_template,
 
-       TP_PROTO(struct rq *rq, struct task_struct *p, int success),
+       TP_PROTO(struct task_struct *p, int success),
 
-       TP_ARGS(rq, p, success),
+       TP_ARGS(p, success),
 
        TP_STRUCT__entry(
                __array(        char,   comm,   TASK_COMM_LEN   )
@@ -111,31 +105,25 @@ DECLARE_EVENT_CLASS(sched_wakeup_template,
 );
 
 DEFINE_EVENT(sched_wakeup_template, sched_wakeup,
-            TP_PROTO(struct rq *rq, struct task_struct *p, int success),
-            TP_ARGS(rq, p, success));
+            TP_PROTO(struct task_struct *p, int success),
+            TP_ARGS(p, success));
 
 /*
  * Tracepoint for waking up a new task:
- *
- * (NOTE: the 'rq' argument is not used by generic trace events,
- *        but used by the latency tracer plugin. )
  */
 DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
-            TP_PROTO(struct rq *rq, struct task_struct *p, int success),
-            TP_ARGS(rq, p, success));
+            TP_PROTO(struct task_struct *p, int success),
+            TP_ARGS(p, success));
 
 /*
  * Tracepoint for task switches, performed by the scheduler:
- *
- * (NOTE: the 'rq' argument is not used by generic trace events,
- *        but used by the latency tracer plugin. )
  */
 TRACE_EVENT(sched_switch,
 
-       TP_PROTO(struct rq *rq, struct task_struct *prev,
+       TP_PROTO(struct task_struct *prev,
                 struct task_struct *next),
 
-       TP_ARGS(rq, prev, next),
+       TP_ARGS(prev, next),
 
        TP_STRUCT__entry(
                __array(        char,   prev_comm,      TASK_COMM_LEN   )
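
Dropping the rq argument also changes the probes: the latency tracers now attach through the ordinary data-passing registration instead of having the runqueue smuggled in. A sketch of a probe for the slimmed-down sched_switch prototype; the probe name and the NULL data pointer are illustrative, the tracers under kernel/trace/ updated later in this diff being the real consumers:

static void probe_sched_switch(void *ignore,
                               struct task_struct *prev,
                               struct task_struct *next)
{
        /* no struct rq * any more; everything needed is in the two tasks */
}

static int attach_sched_switch_probe(void)
{
        return register_trace_sched_switch(probe_sched_switch, NULL);
}
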
index a510b75ac304505dbd8771b2b0ffcd101c397c90..814566c99d29eaf53b0410375c5306c1f6997c58 100644 (file)
@@ -100,18 +100,7 @@ TRACE_EVENT(signal_deliver,
                  __entry->sa_handler, __entry->sa_flags)
 );
 
-/**
- * signal_overflow_fail - called when signal queue is overflow
- * @sig: signal number
- * @group: signal to process group or not (bool)
- * @info: pointer to struct siginfo
- *
- * Kernel fails to generate 'sig' signal with 'info' siginfo, because
- * siginfo queue is overflow, and the signal is dropped.
- * 'group' is not 0 if the signal will be sent to a process group.
- * 'sig' is always one of RT signals.
- */
-TRACE_EVENT(signal_overflow_fail,
+DECLARE_EVENT_CLASS(signal_queue_overflow,
 
        TP_PROTO(int sig, int group, struct siginfo *info),
 
@@ -134,6 +123,24 @@ TRACE_EVENT(signal_overflow_fail,
                  __entry->sig, __entry->group, __entry->errno, __entry->code)
 );
 
+/**
+ * signal_overflow_fail - called when the signal queue overflows
+ * @sig: signal number
+ * @group: signal to process group or not (bool)
+ * @info: pointer to struct siginfo
+ *
+ * The kernel fails to generate the 'sig' signal with 'info' siginfo because
+ * the siginfo queue has overflowed, and the signal is dropped.
+ * 'group' is not 0 if the signal will be sent to a process group.
+ * 'sig' is always one of RT signals.
+ */
+DEFINE_EVENT(signal_queue_overflow, signal_overflow_fail,
+
+       TP_PROTO(int sig, int group, struct siginfo *info),
+
+       TP_ARGS(sig, group, info)
+);
+
 /**
  * signal_lose_info - called when siginfo is lost
  * @sig: signal number
@@ -145,28 +152,13 @@ TRACE_EVENT(signal_overflow_fail,
  * 'group' is not 0 if the signal will be sent to a process group.
  * 'sig' is always one of non-RT signals.
  */
-TRACE_EVENT(signal_lose_info,
+DEFINE_EVENT(signal_queue_overflow, signal_lose_info,
 
        TP_PROTO(int sig, int group, struct siginfo *info),
 
-       TP_ARGS(sig, group, info),
-
-       TP_STRUCT__entry(
-               __field(        int,    sig     )
-               __field(        int,    group   )
-               __field(        int,    errno   )
-               __field(        int,    code    )
-       ),
-
-       TP_fast_assign(
-               __entry->sig    = sig;
-               __entry->group  = group;
-               TP_STORE_SIGINFO(__entry, info);
-       ),
-
-       TP_printk("sig=%d group=%d errno=%d code=%d",
-                 __entry->sig, __entry->group, __entry->errno, __entry->code)
+       TP_ARGS(sig, group, info)
 );
+
 #endif /* _TRACE_SIGNAL_H */
 
 /* This part must be outside protection */
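
Because the two events now share the signal_queue_overflow class, any further event with the same sig/group/info shape costs only a DEFINE_EVENT stub instead of a full TRACE_EVENT expansion. A hypothetical example of reusing the class (signal_drop_example is not an in-tree event):

DEFINE_EVENT(signal_queue_overflow, signal_drop_example,

        TP_PROTO(int sig, int group, struct siginfo *info),

        TP_ARGS(sig, group, info)
);
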
index 882c64832ffe07b774df0d8932eadc8b919ea51a..e0e8daa6767e052d1a95b1bf1c95016de8049e85 100644 (file)
                struct trace_entry      ent;                            \
                tstruct                                                 \
                char                    __data[0];                      \
-       };
+       };                                                              \
+                                                                       \
+       static struct ftrace_event_class event_class_##name;
+
 #undef DEFINE_EVENT
 #define DEFINE_EVENT(template, name, proto, args)      \
        static struct ftrace_event_call                 \
  *
  *     entry = iter->ent;
  *
- *     if (entry->type != event_<call>.id) {
+ *     if (entry->type != event_<call>->event.type) {
  *             WARN_ON_ONCE(1);
  *             return TRACE_TYPE_UNHANDLED;
  *     }
  *
  *     field = (typeof(field))entry;
  *
- *     p = get_cpu_var(ftrace_event_seq);
+ *     p = &get_cpu_var(ftrace_event_seq);
  *     trace_seq_init(p);
- *     ret = trace_seq_printf(s, <TP_printk> "\n");
+ *     ret = trace_seq_printf(s, "%s: ", <call>);
+ *     if (ret)
+ *             ret = trace_seq_printf(s, <TP_printk> "\n");
  *     put_cpu();
  *     if (!ret)
  *             return TRACE_TYPE_PARTIAL_LINE;
 #undef DECLARE_EVENT_CLASS
 #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
 static notrace enum print_line_t                                       \
-ftrace_raw_output_id_##call(int event_id, const char *name,            \
-                           struct trace_iterator *iter, int flags)     \
+ftrace_raw_output_##call(struct trace_iterator *iter, int flags,       \
+                        struct trace_event *trace_event)               \
 {                                                                      \
+       struct ftrace_event_call *event;                                \
        struct trace_seq *s = &iter->seq;                               \
        struct ftrace_raw_##call *field;                                \
        struct trace_entry *entry;                                      \
        struct trace_seq *p;                                            \
        int ret;                                                        \
                                                                        \
+       event = container_of(trace_event, struct ftrace_event_call,     \
+                            event);                                    \
+                                                                       \
        entry = iter->ent;                                              \
                                                                        \
-       if (entry->type != event_id) {                                  \
+       if (entry->type != event->event.type) {                         \
                WARN_ON_ONCE(1);                                        \
                return TRACE_TYPE_UNHANDLED;                            \
        }                                                               \
@@ -221,7 +230,7 @@ ftrace_raw_output_id_##call(int event_id, const char *name,         \
                                                                        \
        p = &get_cpu_var(ftrace_event_seq);                             \
        trace_seq_init(p);                                              \
-       ret = trace_seq_printf(s, "%s: ", name);                        \
+       ret = trace_seq_printf(s, "%s: ", event->name);                 \
        if (ret)                                                        \
                ret = trace_seq_printf(s, print);                       \
        put_cpu();                                                      \
@@ -229,21 +238,16 @@ ftrace_raw_output_id_##call(int event_id, const char *name,               \
                return TRACE_TYPE_PARTIAL_LINE;                         \
                                                                        \
        return TRACE_TYPE_HANDLED;                                      \
-}
-
-#undef DEFINE_EVENT
-#define DEFINE_EVENT(template, name, proto, args)                      \
-static notrace enum print_line_t                                       \
-ftrace_raw_output_##name(struct trace_iterator *iter, int flags)       \
-{                                                                      \
-       return ftrace_raw_output_id_##template(event_##name.id,         \
-                                              #name, iter, flags);     \
-}
+}                                                                      \
+static struct trace_event_functions ftrace_event_type_funcs_##call = { \
+       .trace                  = ftrace_raw_output_##call,             \
+};
 
 #undef DEFINE_EVENT_PRINT
 #define DEFINE_EVENT_PRINT(template, call, proto, args, print)         \
 static notrace enum print_line_t                                       \
-ftrace_raw_output_##call(struct trace_iterator *iter, int flags)       \
+ftrace_raw_output_##call(struct trace_iterator *iter, int flags,       \
+                        struct trace_event *event)                     \
 {                                                                      \
        struct trace_seq *s = &iter->seq;                               \
        struct ftrace_raw_##template *field;                            \
@@ -253,7 +257,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)    \
                                                                        \
        entry = iter->ent;                                              \
                                                                        \
-       if (entry->type != event_##call.id) {                           \
+       if (entry->type != event_##call.event.type) {                   \
                WARN_ON_ONCE(1);                                        \
                return TRACE_TYPE_UNHANDLED;                            \
        }                                                               \
@@ -270,7 +274,10 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)   \
                return TRACE_TYPE_PARTIAL_LINE;                         \
                                                                        \
        return TRACE_TYPE_HANDLED;                                      \
-}
+}                                                                      \
+static struct trace_event_functions ftrace_event_type_funcs_##call = { \
+       .trace                  = ftrace_raw_output_##call,             \
+};
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
@@ -376,142 +383,83 @@ static inline notrace int ftrace_get_offsets_##call(                     \
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
-#ifdef CONFIG_PERF_EVENTS
-
-/*
- * Generate the functions needed for tracepoint perf_event support.
- *
- * NOTE: The insertion profile callback (ftrace_profile_<call>) is defined later
- *
- * static int ftrace_profile_enable_<call>(void)
- * {
- *     return register_trace_<call>(ftrace_profile_<call>);
- * }
- *
- * static void ftrace_profile_disable_<call>(void)
- * {
- *     unregister_trace_<call>(ftrace_profile_<call>);
- * }
- *
- */
-
-#undef DECLARE_EVENT_CLASS
-#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)
-
-#undef DEFINE_EVENT
-#define DEFINE_EVENT(template, name, proto, args)                      \
-                                                                       \
-static void perf_trace_##name(proto);                                  \
-                                                                       \
-static notrace int                                                     \
-perf_trace_enable_##name(struct ftrace_event_call *unused)             \
-{                                                                      \
-       return register_trace_##name(perf_trace_##name);                \
-}                                                                      \
-                                                                       \
-static notrace void                                                    \
-perf_trace_disable_##name(struct ftrace_event_call *unused)            \
-{                                                                      \
-       unregister_trace_##name(perf_trace_##name);                     \
-}
-
-#undef DEFINE_EVENT_PRINT
-#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
-       DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
-
-#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
-
-#endif /* CONFIG_PERF_EVENTS */
-
 /*
  * Stage 4 of the trace events.
  *
  * Override the macros in <trace/trace_events.h> to include the following:
  *
- * static void ftrace_event_<call>(proto)
- * {
- *     event_trace_printk(_RET_IP_, "<call>: " <fmt>);
- * }
- *
- * static int ftrace_reg_event_<call>(struct ftrace_event_call *unused)
- * {
- *     return register_trace_<call>(ftrace_event_<call>);
- * }
- *
- * static void ftrace_unreg_event_<call>(struct ftrace_event_call *unused)
- * {
- *     unregister_trace_<call>(ftrace_event_<call>);
- * }
- *
- *
  * For those macros defined with TRACE_EVENT:
  *
  * static struct ftrace_event_call event_<call>;
  *
- * static void ftrace_raw_event_<call>(proto)
+ * static void ftrace_raw_event_<call>(void *__data, proto)
  * {
+ *     struct ftrace_event_call *event_call = __data;
+ *     struct ftrace_data_offsets_<call> __maybe_unused __data_offsets;
  *     struct ring_buffer_event *event;
  *     struct ftrace_raw_<call> *entry; <-- defined in stage 1
  *     struct ring_buffer *buffer;
  *     unsigned long irq_flags;
+ *     int __data_size;
  *     int pc;
  *
  *     local_save_flags(irq_flags);
  *     pc = preempt_count();
  *
+ *     __data_size = ftrace_get_offsets_<call>(&__data_offsets, args);
+ *
  *     event = trace_current_buffer_lock_reserve(&buffer,
- *                               event_<call>.id,
- *                               sizeof(struct ftrace_raw_<call>),
+ *                               event_<call>->event.type,
+ *                               sizeof(*entry) + __data_size,
  *                               irq_flags, pc);
  *     if (!event)
  *             return;
  *     entry   = ring_buffer_event_data(event);
  *
- *     <assign>;  <-- Here we assign the entries by the __field and
- *                     __array macros.
- *
- *     trace_current_buffer_unlock_commit(buffer, event, irq_flags, pc);
- * }
- *
- * static int ftrace_raw_reg_event_<call>(struct ftrace_event_call *unused)
- * {
- *     int ret;
- *
- *     ret = register_trace_<call>(ftrace_raw_event_<call>);
- *     if (!ret)
- *             pr_info("event trace: Could not activate trace point "
- *                     "probe to <call>");
- *     return ret;
- * }
+ *     { <assign>; }  <-- Here we assign the entries by the __field and
+ *                        __array macros.
  *
- * static void ftrace_unreg_event_<call>(struct ftrace_event_call *unused)
- * {
- *     unregister_trace_<call>(ftrace_raw_event_<call>);
+ *     if (!filter_current_check_discard(buffer, event_call, entry, event))
+ *             trace_current_buffer_unlock_commit(buffer,
+ *                                                event, irq_flags, pc);
  * }
  *
  * static struct trace_event ftrace_event_type_<call> = {
  *     .trace                  = ftrace_raw_output_<call>, <-- stage 2
  * };
  *
+ * static const char print_fmt_<call>[] = <TP_printk>;
+ *
+ * static struct ftrace_event_class __used event_class_<template> = {
+ *     .system                 = "<system>",
+ *     .define_fields          = ftrace_define_fields_<call>,
+ *     .fields                 = LIST_HEAD_INIT(event_class_##call.fields),
+ *     .raw_init               = trace_event_raw_init,
+ *     .probe                  = ftrace_raw_event_##call,
+ * };
+ *
  * static struct ftrace_event_call __used
  * __attribute__((__aligned__(4)))
  * __attribute__((section("_ftrace_events"))) event_<call> = {
  *     .name                   = "<call>",
- *     .system                 = "<system>",
- *     .raw_init               = trace_event_raw_init,
- *     .regfunc                = ftrace_reg_event_<call>,
- *     .unregfunc              = ftrace_unreg_event_<call>,
- * }
+ *     .class                  = event_class_<template>,
+ *     .event                  = &ftrace_event_type_<call>,
+ *     .print_fmt              = print_fmt_<call>,
+ * };
  *
  */
 
 #ifdef CONFIG_PERF_EVENTS
 
+#define _TRACE_PERF_PROTO(call, proto)                                 \
+       static notrace void                                             \
+       perf_trace_##call(void *__data, proto);
+
 #define _TRACE_PERF_INIT(call)                                         \
-       .perf_event_enable = perf_trace_enable_##call,                  \
-       .perf_event_disable = perf_trace_disable_##call,
+       .perf_probe             = perf_trace_##call,
 
 #else
+#define _TRACE_PERF_PROTO(call, proto)
 #define _TRACE_PERF_INIT(call)
 #endif /* CONFIG_PERF_EVENTS */
 
@@ -545,9 +493,9 @@ perf_trace_disable_##name(struct ftrace_event_call *unused)         \
 #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
                                                                        \
 static notrace void                                                    \
-ftrace_raw_event_id_##call(struct ftrace_event_call *event_call,       \
-                                      proto)                           \
+ftrace_raw_event_##call(void *__data, proto)                           \
 {                                                                      \
+       struct ftrace_event_call *event_call = __data;                  \
        struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
        struct ring_buffer_event *event;                                \
        struct ftrace_raw_##call *entry;                                \
@@ -562,14 +510,13 @@ ftrace_raw_event_id_##call(struct ftrace_event_call *event_call,  \
        __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
                                                                        \
        event = trace_current_buffer_lock_reserve(&buffer,              \
-                                event_call->id,                        \
+                                event_call->event.type,                \
                                 sizeof(*entry) + __data_size,          \
                                 irq_flags, pc);                        \
        if (!event)                                                     \
                return;                                                 \
        entry   = ring_buffer_event_data(event);                        \
                                                                        \
-                                                                       \
        tstruct                                                         \
                                                                        \
        { assign; }                                                     \
@@ -578,34 +525,21 @@ ftrace_raw_event_id_##call(struct ftrace_event_call *event_call,  \
                trace_nowake_buffer_unlock_commit(buffer,               \
                                                  event, irq_flags, pc); \
 }
+/*
+ * The ftrace_test_probe is compiled out; it is only here as a build-time check
+ * to make sure that if the tracepoint handling changes, the ftrace probe will
+ * fail to compile unless it too is updated.
+ */
 
 #undef DEFINE_EVENT
 #define DEFINE_EVENT(template, call, proto, args)                      \
-                                                                       \
-static notrace void ftrace_raw_event_##call(proto)                     \
-{                                                                      \
-       ftrace_raw_event_id_##template(&event_##call, args);            \
-}                                                                      \
-                                                                       \
-static notrace int                                                     \
-ftrace_raw_reg_event_##call(struct ftrace_event_call *unused)          \
-{                                                                      \
-       return register_trace_##call(ftrace_raw_event_##call);          \
-}                                                                      \
-                                                                       \
-static notrace void                                                    \
-ftrace_raw_unreg_event_##call(struct ftrace_event_call *unused)                \
+static inline void ftrace_test_probe_##call(void)                      \
 {                                                                      \
-       unregister_trace_##call(ftrace_raw_event_##call);               \
-}                                                                      \
-                                                                       \
-static struct trace_event ftrace_event_type_##call = {                 \
-       .trace                  = ftrace_raw_output_##call,             \
-};
+       check_trace_callback_type_##call(ftrace_raw_event_##template);  \
+}
 
 #undef DEFINE_EVENT_PRINT
-#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
-       DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
+#define DEFINE_EVENT_PRINT(template, name, proto, args, print)
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
@@ -622,7 +556,16 @@ static struct trace_event ftrace_event_type_##call = {                     \
 
 #undef DECLARE_EVENT_CLASS
 #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
-static const char print_fmt_##call[] = print;
+_TRACE_PERF_PROTO(call, PARAMS(proto));                                        \
+static const char print_fmt_##call[] = print;                          \
+static struct ftrace_event_class __used event_class_##call = {         \
+       .system                 = __stringify(TRACE_SYSTEM),            \
+       .define_fields          = ftrace_define_fields_##call,          \
+       .fields                 = LIST_HEAD_INIT(event_class_##call.fields),\
+       .raw_init               = trace_event_raw_init,                 \
+       .probe                  = ftrace_raw_event_##call,              \
+       _TRACE_PERF_INIT(call)                                          \
+};
 
 #undef DEFINE_EVENT
 #define DEFINE_EVENT(template, call, proto, args)                      \
@@ -631,15 +574,10 @@ static struct ftrace_event_call __used                                    \
 __attribute__((__aligned__(4)))                                                \
 __attribute__((section("_ftrace_events"))) event_##call = {            \
        .name                   = #call,                                \
-       .system                 = __stringify(TRACE_SYSTEM),            \
-       .event                  = &ftrace_event_type_##call,            \
-       .raw_init               = trace_event_raw_init,                 \
-       .regfunc                = ftrace_raw_reg_event_##call,          \
-       .unregfunc              = ftrace_raw_unreg_event_##call,        \
+       .class                  = &event_class_##template,              \
+       .event.funcs            = &ftrace_event_type_funcs_##template,  \
        .print_fmt              = print_fmt_##template,                 \
-       .define_fields          = ftrace_define_fields_##template,      \
-       _TRACE_PERF_INIT(call)                                  \
-}
+};
 
 #undef DEFINE_EVENT_PRINT
 #define DEFINE_EVENT_PRINT(template, call, proto, args, print)         \
@@ -650,14 +588,9 @@ static struct ftrace_event_call __used                                     \
 __attribute__((__aligned__(4)))                                                \
 __attribute__((section("_ftrace_events"))) event_##call = {            \
        .name                   = #call,                                \
-       .system                 = __stringify(TRACE_SYSTEM),            \
-       .event                  = &ftrace_event_type_##call,            \
-       .raw_init               = trace_event_raw_init,                 \
-       .regfunc                = ftrace_raw_reg_event_##call,          \
-       .unregfunc              = ftrace_raw_unreg_event_##call,        \
+       .class                  = &event_class_##template,              \
+       .event.funcs            = &ftrace_event_type_funcs_##call,      \
        .print_fmt              = print_fmt_##call,                     \
-       .define_fields          = ftrace_define_fields_##template,      \
-       _TRACE_PERF_INIT(call)                                  \
 }
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
@@ -757,17 +690,20 @@ __attribute__((section("_ftrace_events"))) event_##call = {               \
 #undef DECLARE_EVENT_CLASS
 #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
 static notrace void                                                    \
-perf_trace_templ_##call(struct ftrace_event_call *event_call,          \
-                       struct pt_regs *__regs, proto)                  \
+perf_trace_##call(void *__data, proto)                                 \
 {                                                                      \
+       struct ftrace_event_call *event_call = __data;                  \
        struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
        struct ftrace_raw_##call *entry;                                \
+       struct pt_regs *__regs = &get_cpu_var(perf_trace_regs);         \
        u64 __addr = 0, __count = 1;                                    \
        unsigned long irq_flags;                                        \
        int __entry_size;                                               \
        int __data_size;                                                \
        int rctx;                                                       \
                                                                        \
+       perf_fetch_caller_regs(__regs, 1);                              \
+                                                                       \
        __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
        __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\
                             sizeof(u64));                              \
@@ -775,33 +711,35 @@ perf_trace_templ_##call(struct ftrace_event_call *event_call,             \
                                                                        \
        if (WARN_ONCE(__entry_size > PERF_MAX_TRACE_SIZE,               \
                      "profile buffer not large enough"))               \
-               return;                                                 \
+               goto out;                                               \
        entry = (struct ftrace_raw_##call *)perf_trace_buf_prepare(     \
-               __entry_size, event_call->id, &rctx, &irq_flags);       \
+               __entry_size, event_call->event.type, &rctx, &irq_flags); \
        if (!entry)                                                     \
-               return;                                                 \
+               goto out;                                               \
        tstruct                                                         \
                                                                        \
        { assign; }                                                     \
                                                                        \
        perf_trace_buf_submit(entry, __entry_size, rctx, __addr,        \
                               __count, irq_flags, __regs);             \
+ out:                                                                  \
+       put_cpu_var(perf_trace_regs);                                   \
 }
 
+/*
+ * This part is compiled out; it is only here as a build-time check
+ * to make sure that if the tracepoint handling changes, the
+ * perf probe will fail to compile unless it too is updated.
+ */
 #undef DEFINE_EVENT
 #define DEFINE_EVENT(template, call, proto, args)                      \
-static notrace void perf_trace_##call(proto)                           \
+static inline void perf_test_probe_##call(void)                                \
 {                                                                      \
-       struct ftrace_event_call *event_call = &event_##call;           \
-       struct pt_regs *__regs = &get_cpu_var(perf_trace_regs);         \
-                                                                       \
-       perf_fetch_caller_regs(__regs, 1);                              \
+       check_trace_callback_type_##call(perf_trace_##template);        \
                                                                        \
-       perf_trace_templ_##template(event_call, __regs, args);          \
-                                                                       \
-       put_cpu_var(perf_trace_regs);                                   \
 }
 
+
 #undef DEFINE_EVENT_PRINT
 #define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
        DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
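
Both ftrace_test_probe_<call>() and perf_test_probe_<call>() compile to nothing; their only job is to pass the generated probe through check_trace_callback_type_<call>() so that a prototype mismatch trips the compiler's function-pointer type checking. A stand-alone sketch of the mechanism, all names hypothetical:

typedef void (*example_cb_t)(void *__data, int value);

static inline void check_example_callback_type(example_cb_t cb)
{
}

static void good_probe(void *__data, int value)
{
}

static void bad_probe(void *__data, long value, int extra)
{
}

static inline void example_test_probe(void)
{
        check_example_callback_type(good_probe);
        /* check_example_callback_type(bad_probe); -- rejected: incompatible pointer type */
}
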
index e5e5f48dbfb3f0a378cd1cc4dcfdcc15b7eb308e..257e08960d7b7f1c232ca6cef97f1cac3f0737e5 100644 (file)
@@ -25,6 +25,8 @@ struct syscall_metadata {
        int             nb_args;
        const char      **types;
        const char      **args;
+       struct list_head enter_fields;
+       struct list_head exit_fields;
 
        struct ftrace_event_call *enter_event;
        struct ftrace_event_call *exit_event;
@@ -34,16 +36,16 @@ struct syscall_metadata {
 extern unsigned long arch_syscall_addr(int nr);
 extern int init_syscall_trace(struct ftrace_event_call *call);
 
-extern int syscall_enter_define_fields(struct ftrace_event_call *call);
-extern int syscall_exit_define_fields(struct ftrace_event_call *call);
 extern int reg_event_syscall_enter(struct ftrace_event_call *call);
 extern void unreg_event_syscall_enter(struct ftrace_event_call *call);
 extern int reg_event_syscall_exit(struct ftrace_event_call *call);
 extern void unreg_event_syscall_exit(struct ftrace_event_call *call);
 extern int
 ftrace_format_syscall(struct ftrace_event_call *call, struct trace_seq *s);
-enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags);
-enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags);
+enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags,
+                                     struct trace_event *event);
+enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags,
+                                    struct trace_event *event);
 #endif
 
 #ifdef CONFIG_PERF_EVENTS
index eb77e8ccde1c47d979cec79ec7c0fd1a12091161..5fe94b82e4c0a7ee5b779b58b8db09c162533f51 100644 (file)
@@ -604,8 +604,7 @@ config RT_GROUP_SCHED
        default n
        help
          This feature lets you explicitly allocate real CPU bandwidth
-         to users or control groups (depending on the "Basis for grouping tasks"
-         setting below. If enabled, it will also make it impossible to
+         to task groups. If enabled, it will also make it impossible to
          schedule realtime tasks for non-root users until you allocate
          realtime bandwidth for them.
          See Documentation/scheduler/sched-rt-group.txt for more information.
index a987aa1676b594b5501efd7213ccf671318c94c9..149e18ef1ab14f1f82c419d5d40e87dcff28fab5 100644 (file)
@@ -68,7 +68,7 @@ obj-$(CONFIG_USER_NS) += user_namespace.o
 obj-$(CONFIG_PID_NS) += pid_namespace.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
-obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
+obj-$(CONFIG_SMP) += stop_machine.o
 obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
 obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
index 9e4697e9b276e7429fed888b76c8e4ee19562e4c..2f05303715a5c4066a04b128251ebeec08d3779b 100644 (file)
@@ -15,7 +15,6 @@
 #include <linux/syscalls.h>
 #include <linux/pid_namespace.h>
 #include <asm/uaccess.h>
-#include "cred-internals.h"
 
 /*
  * Leveraged for setting/resetting capabilities
index e2769e13980c49b2546d0bb2c6cef1e0065f6ca7..4a07d057a265f644ad73a56a7e02cac1def8cdce 100644 (file)
@@ -3010,7 +3010,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
        unsigned long flags = (unsigned long)key;
 
        if (flags & POLLHUP) {
-               remove_wait_queue_locked(event->wqh, &event->wait);
+               __remove_wait_queue(event->wqh, &event->wait);
                spin_lock(&cgrp->event_list_lock);
                list_del(&event->list);
                spin_unlock(&cgrp->event_list_lock);
index 25bba73b1be3a67fe1ce0f932f9d833c81b245f5..545777574779da71ef8e3fcc3935e0e41fa266ca 100644 (file)
@@ -164,6 +164,7 @@ static inline void check_for_tasks(int cpu)
 }
 
 struct take_cpu_down_param {
+       struct task_struct *caller;
        unsigned long mod;
        void *hcpu;
 };
@@ -172,6 +173,7 @@ struct take_cpu_down_param {
 static int __ref take_cpu_down(void *_param)
 {
        struct take_cpu_down_param *param = _param;
+       unsigned int cpu = (unsigned long)param->hcpu;
        int err;
 
        /* Ensure this CPU doesn't handle any more interrupts. */
@@ -182,6 +184,8 @@ static int __ref take_cpu_down(void *_param)
        raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
                                param->hcpu);
 
+       if (task_cpu(param->caller) == cpu)
+               move_task_off_dead_cpu(cpu, param->caller);
        /* Force idle task to run as soon as we yield: it should
           immediately notice cpu is offline and die quickly. */
        sched_idle_next();
@@ -192,10 +196,10 @@ static int __ref take_cpu_down(void *_param)
 static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 {
        int err, nr_calls = 0;
-       cpumask_var_t old_allowed;
        void *hcpu = (void *)(long)cpu;
        unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
        struct take_cpu_down_param tcd_param = {
+               .caller = current,
                .mod = mod,
                .hcpu = hcpu,
        };
@@ -206,9 +210,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
        if (!cpu_online(cpu))
                return -EINVAL;
 
-       if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL))
-               return -ENOMEM;
-
        cpu_hotplug_begin();
        set_cpu_active(cpu, false);
        err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
@@ -225,10 +226,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
                goto out_release;
        }
 
-       /* Ensure that we are not runnable on dying cpu */
-       cpumask_copy(old_allowed, &current->cpus_allowed);
-       set_cpus_allowed_ptr(current, cpu_active_mask);
-
        err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
        if (err) {
                set_cpu_active(cpu, true);
@@ -237,7 +234,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
                                            hcpu) == NOTIFY_BAD)
                        BUG();
 
-               goto out_allowed;
+               goto out_release;
        }
        BUG_ON(cpu_online(cpu));
 
@@ -255,8 +252,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 
        check_for_tasks(cpu);
 
-out_allowed:
-       set_cpus_allowed_ptr(current, old_allowed);
 out_release:
        cpu_hotplug_done();
        if (!err) {
@@ -264,7 +259,6 @@ out_release:
                                            hcpu) == NOTIFY_BAD)
                        BUG();
        }
-       free_cpumask_var(old_allowed);
        return err;
 }
 
@@ -272,9 +266,6 @@ int __ref cpu_down(unsigned int cpu)
 {
        int err;
 
-       err = stop_machine_create();
-       if (err)
-               return err;
        cpu_maps_update_begin();
 
        if (cpu_hotplug_disabled) {
@@ -286,7 +277,6 @@ int __ref cpu_down(unsigned int cpu)
 
 out:
        cpu_maps_update_done();
-       stop_machine_destroy();
        return err;
 }
 EXPORT_SYMBOL(cpu_down);
@@ -367,9 +357,6 @@ int disable_nonboot_cpus(void)
 {
        int cpu, first_cpu, error;
 
-       error = stop_machine_create();
-       if (error)
-               return error;
        cpu_maps_update_begin();
        first_cpu = cpumask_first(cpu_online_mask);
        /*
@@ -400,7 +387,6 @@ int disable_nonboot_cpus(void)
                printk(KERN_ERR "Non-boot CPUs are not disabled\n");
        }
        cpu_maps_update_done();
-       stop_machine_destroy();
        return error;
 }
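
With stop_machine reworked on top of cpu_stop (kernel/stop_machine.c in this series), callers no longer need the stop_machine_create()/stop_machine_destroy() bracketing these hunks delete. A minimal sketch of a caller after the change; the callback is illustrative:

#include <linux/stop_machine.h>

static int quiesce_fn(void *unused)
{
        /* runs while every online CPU spins with interrupts disabled */
        return 0;
}

static int quiesce_system(void)
{
        /* no create/destroy bracketing required any more */
        return stop_machine(quiesce_fn, NULL, NULL);
}
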
 
index d10946748ec2a3c9a070301f4af78c5f4dd18845..9a50c5f6e727f3f77ec5dcf2d5f993fc62717ce9 100644 (file)
@@ -2182,19 +2182,52 @@ void __init cpuset_init_smp(void)
 void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 {
        mutex_lock(&callback_mutex);
-       cpuset_cpus_allowed_locked(tsk, pmask);
+       task_lock(tsk);
+       guarantee_online_cpus(task_cs(tsk), pmask);
+       task_unlock(tsk);
        mutex_unlock(&callback_mutex);
 }
 
-/**
- * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
- * Must be called with callback_mutex held.
- **/
-void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
+int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
 {
-       task_lock(tsk);
-       guarantee_online_cpus(task_cs(tsk), pmask);
-       task_unlock(tsk);
+       const struct cpuset *cs;
+       int cpu;
+
+       rcu_read_lock();
+       cs = task_cs(tsk);
+       if (cs)
+               cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
+       rcu_read_unlock();
+
+       /*
+        * We own tsk->cpus_allowed, nobody can change it under us.
+        *
+        * But we used cs && cs->cpus_allowed lockless and thus can
+        * race with cgroup_attach_task() or update_cpumask() and get
+        * the wrong tsk->cpus_allowed. However, both cases imply the
+        * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
+        * which takes task_rq_lock().
+        *
+        * If we are called after it dropped the lock we must see all
+        * changes in task_cs()->cpus_allowed. Otherwise we can temporarily
+        * set any mask even if it is not right from task_cs() pov,
+        * the pending set_cpus_allowed_ptr() will fix things.
+        */
+
+       cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
+       if (cpu >= nr_cpu_ids) {
+               /*
+                * Either tsk->cpus_allowed is wrong (see above) or it
+                * is actually empty. The latter case is only possible
+                * if we are racing with remove_tasks_in_empty_cpuset().
+                * Like above, we can temporarily set any mask and rely on
+                * set_cpus_allowed_ptr() as synchronization point.
+                */
+               cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
+               cpu = cpumask_any(cpu_active_mask);
+       }
+
+       return cpu;
 }
 
 void cpuset_init_current_mems_allowed(void)
@@ -2382,22 +2415,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
        return 0;
 }
 
-/**
- * cpuset_lock - lock out any changes to cpuset structures
- *
- * The out of memory (oom) code needs to mutex_lock cpusets
- * from being changed while it scans the tasklist looking for a
- * task in an overlapping cpuset.  Expose callback_mutex via this
- * cpuset_lock() routine, so the oom code can lock it, before
- * locking the task list.  The tasklist_lock is a spinlock, so
- * must be taken inside callback_mutex.
- */
-
-void cpuset_lock(void)
-{
-       mutex_lock(&callback_mutex);
-}
-
 /**
  * cpuset_unlock - release lock on cpuset changes
  *
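
cpuset_cpus_allowed_fallback() is the scheduler's last resort: when a dying or emptied cpuset leaves a task with no active CPU in its mask, the helper rewrites tsk->cpus_allowed and returns something usable. A sketch loosely modeled on the select_fallback_rq() path updated elsewhere in this merge; the function name is illustrative:

static int pick_fallback_cpu(struct task_struct *p)
{
        int cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);

        if (cpu >= nr_cpu_ids) {
                /* nothing active left in the mask; let the cpuset code repair it */
                cpu = cpuset_cpus_allowed_fallback(p);
        }

        return cpu;
}
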
diff --git a/kernel/cred-internals.h b/kernel/cred-internals.h
deleted file mode 100644 (file)
index 2dc4fc2..0000000
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Internal credentials stuff
- *
- * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-/*
- * user.c
- */
-static inline void sched_switch_user(struct task_struct *p)
-{
-#ifdef CONFIG_USER_SCHED
-       sched_move_task(p);
-#endif /* CONFIG_USER_SCHED */
-}
-
index 62af1816c2352eaa9238887ef567a0b62ab5426f..8f3672a58a1e82a39de67ee4f35e7b2f0c12ac2b 100644 (file)
@@ -17,7 +17,6 @@
 #include <linux/init_task.h>
 #include <linux/security.h>
 #include <linux/cn_proc.h>
-#include "cred-internals.h"
 
 #if 0
 #define kdebug(FMT, ...) \
@@ -560,8 +559,6 @@ int commit_creds(struct cred *new)
                atomic_dec(&old->user->processes);
        alter_cred_subscribers(old, -2);
 
-       sched_switch_user(task);
-
        /* send notifications */
        if (new->uid   != old->uid  ||
            new->euid  != old->euid ||
index 7f2683a10ac40d2682a4a400091ec61bd2131739..eabca5a73a85b70d8d1d9f2ea95c05ae6c27c1e2 100644 (file)
@@ -55,7 +55,6 @@
 #include <asm/unistd.h>
 #include <asm/pgtable.h>
 #include <asm/mmu_context.h>
-#include "cred-internals.h"
 
 static void exit_mm(struct task_struct * tsk);
 
index 1016b75b026ab61b7ef0ee233915c3b1adeaf64d..e2564580f3f113ec9d391a7ad45beea842d39620 100644 (file)
@@ -59,8 +59,6 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/module.h>
 
-EXPORT_TRACEPOINT_SYMBOL(module_get);
-
 #if 0
 #define DEBUGP printk
 #else
@@ -515,6 +513,9 @@ MODINFO_ATTR(srcversion);
 static char last_unloaded_module[MODULE_NAME_LEN+1];
 
 #ifdef CONFIG_MODULE_UNLOAD
+
+EXPORT_TRACEPOINT_SYMBOL(module_get);
+
 /* Init the unload section of the module. */
 static void module_unload_init(struct module *mod)
 {
@@ -723,16 +724,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
                return -EFAULT;
        name[MODULE_NAME_LEN-1] = '\0';
 
-       /* Create stop_machine threads since free_module relies on
-        * a non-failing stop_machine call. */
-       ret = stop_machine_create();
-       if (ret)
-               return ret;
-
-       if (mutex_lock_interruptible(&module_mutex) != 0) {
-               ret = -EINTR;
-               goto out_stop;
-       }
+       if (mutex_lock_interruptible(&module_mutex) != 0)
+               return -EINTR;
 
        mod = find_module(name);
        if (!mod) {
@@ -792,8 +785,6 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
 
  out:
        mutex_unlock(&module_mutex);
-out_stop:
-       stop_machine_destroy();
        return ret;
 }
 
@@ -867,8 +858,7 @@ void module_put(struct module *module)
                smp_wmb(); /* see comment in module_refcount */
                __this_cpu_inc(module->refptr->decs);
 
-               trace_module_put(module, _RET_IP_,
-                                __this_cpu_read(module->refptr->decs));
+               trace_module_put(module, _RET_IP_);
                /* Maybe they're waiting for us to drop reference? */
                if (unlikely(!module_is_live(module)))
                        wake_up_process(module->waiter);
index 58df55bf83ed919ae4d6372ef4769c0768310a03..2b676f3a0f2621953f37be4c8bd5232f7e17c6f2 100644 (file)
@@ -669,7 +669,7 @@ static struct rcu_torture_ops sched_expedited_ops = {
        .sync           = synchronize_sched_expedited,
        .cb_barrier     = NULL,
        .fqs            = rcu_sched_force_quiescent_state,
-       .stats          = rcu_expedited_torture_stats,
+       .stats          = NULL,
        .irq_capable    = 1,
        .name           = "sched_expedited"
 };
index b11b80a3eed36335c5a9ae24ffd8e9e2540114b8..78554dd0d1a4181e585f11b329ed6cf85d3f88e7 100644 (file)
@@ -55,9 +55,9 @@
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
 #include <linux/percpu.h>
-#include <linux/kthread.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/stop_machine.h>
 #include <linux/sysctl.h>
 #include <linux/syscalls.h>
 #include <linux/times.h>
@@ -503,8 +503,11 @@ struct rq {
        #define CPU_LOAD_IDX_MAX 5
        unsigned long cpu_load[CPU_LOAD_IDX_MAX];
 #ifdef CONFIG_NO_HZ
+       u64 nohz_stamp;
        unsigned char in_nohz_recently;
 #endif
+       unsigned int skip_clock_update;
+
        /* capture load from *all* tasks on this cpu: */
        struct load_weight load;
        unsigned long nr_load_updates;
@@ -546,15 +549,13 @@ struct rq {
        int post_schedule;
        int active_balance;
        int push_cpu;
+       struct cpu_stop_work active_balance_work;
        /* cpu of this runqueue: */
        int cpu;
        int online;
 
        unsigned long avg_load_per_task;
 
-       struct task_struct *migration_thread;
-       struct list_head migration_queue;
-
        u64 rt_avg;
        u64 age_stamp;
        u64 idle_stamp;
@@ -602,6 +603,13 @@ static inline
 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 {
        rq->curr->sched_class->check_preempt_curr(rq, p, flags);
+
+       /*
+        * A queue event has occurred, and we're going to schedule.  In
+        * this case, we can save a useless back-to-back clock update.
+        */
+       if (test_tsk_need_resched(p))
+               rq->skip_clock_update = 1;
 }
 
 static inline int cpu_of(struct rq *rq)
@@ -636,7 +644,8 @@ static inline int cpu_of(struct rq *rq)
 
 inline void update_rq_clock(struct rq *rq)
 {
-       rq->clock = sched_clock_cpu(cpu_of(rq));
+       if (!rq->skip_clock_update)
+               rq->clock = sched_clock_cpu(cpu_of(rq));
 }
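The skip_clock_update flag introduced here is set by check_preempt_curr() above and cleared in put_prev_task() further down in this diff. The following standalone sketch models that handshake; the "clock" is just a counter and the routines are toy stand-ins, not the kernel functions.

#include <stdio.h>

struct toy_rq {
        unsigned long long clock;
        int skip_clock_update;
};

static void update_rq_clock(struct toy_rq *rq, unsigned long long now)
{
        if (!rq->skip_clock_update)
                rq->clock = now;
}

static void enqueue_task(struct toy_rq *rq, unsigned long long now, int will_preempt)
{
        update_rq_clock(rq, now);       /* clock refreshed for the wakeup */
        /* check_preempt_curr(): the wakeup forces a resched, so the
         * immediately following schedule() can skip its own update. */
        if (will_preempt)
                rq->skip_clock_update = 1;
}

static void schedule_put_prev(struct toy_rq *rq, unsigned long long now)
{
        update_rq_clock(rq, now);       /* skipped if flagged above */
        rq->skip_clock_update = 0;      /* re-arm for the next cycle */
}

int main(void)
{
        struct toy_rq rq = { .clock = 0, .skip_clock_update = 0 };

        enqueue_task(&rq, 100, 1);        /* wakeup at t=100, preempts */
        schedule_put_prev(&rq, 101);      /* back-to-back update skipped */
        printf("clock=%llu\n", rq.clock); /* prints 100, not 101 */
        return 0;
}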
 
 /*
@@ -914,16 +923,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 
 /*
- * Check whether the task is waking, we use this to synchronize against
- * ttwu() so that task_cpu() reports a stable number.
- *
- * We need to make an exception for PF_STARTING tasks because the fork
- * path might require task_rq_lock() to work, eg. it can call
- * set_cpus_allowed_ptr() from the cpuset clone_ns code.
+ * Check whether the task is waking; we use this to synchronize ->cpus_allowed
+ * against ttwu().
  */
 static inline int task_is_waking(struct task_struct *p)
 {
-       return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING));
+       return unlikely(p->state == TASK_WAKING);
 }
 
 /*
@@ -936,11 +941,9 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
        struct rq *rq;
 
        for (;;) {
-               while (task_is_waking(p))
-                       cpu_relax();
                rq = task_rq(p);
                raw_spin_lock(&rq->lock);
-               if (likely(rq == task_rq(p) && !task_is_waking(p)))
+               if (likely(rq == task_rq(p)))
                        return rq;
                raw_spin_unlock(&rq->lock);
        }
@@ -957,12 +960,10 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
        struct rq *rq;
 
        for (;;) {
-               while (task_is_waking(p))
-                       cpu_relax();
                local_irq_save(*flags);
                rq = task_rq(p);
                raw_spin_lock(&rq->lock);
-               if (likely(rq == task_rq(p) && !task_is_waking(p)))
+               if (likely(rq == task_rq(p)))
                        return rq;
                raw_spin_unlock_irqrestore(&rq->lock, *flags);
        }
@@ -1239,6 +1240,17 @@ void wake_up_idle_cpu(int cpu)
        if (!tsk_is_polling(rq->idle))
                smp_send_reschedule(cpu);
 }
+
+int nohz_ratelimit(int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+       u64 diff = rq->clock - rq->nohz_stamp;
+
+       rq->nohz_stamp = rq->clock;
+
+       return diff < (NSEC_PER_SEC / HZ) >> 1;
+}
+
 #endif /* CONFIG_NO_HZ */
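nohz_ratelimit() returns nonzero when less than half a tick period has elapsed since it was last consulted for this runqueue; the caller (presumably the tick-stop path in kernel/time/tick-sched.c, which this merge also touches) then keeps the tick running. A standalone model of the arithmetic, with HZ and the timestamps made up for illustration:

#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL
#define HZ           1000ULL            /* assumed CONFIG_HZ value */

static unsigned long long nohz_stamp;   /* models rq->nohz_stamp */

static int nohz_ratelimit(unsigned long long clock)
{
        unsigned long long diff = clock - nohz_stamp;

        nohz_stamp = clock;
        /* same expression as above: half a tick, i.e. 0.5 ms at HZ=1000 */
        return diff < (NSEC_PER_SEC / HZ) >> 1;
}

int main(void)
{
        printf("%d\n", nohz_ratelimit(1000000));   /* 1 ms since boot: not limited */
        printf("%d\n", nohz_ratelimit(1200000));   /* 0.2 ms later: ratelimited */
        return 0;
}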
 
 static u64 sched_avg_period(void)
@@ -1781,8 +1793,6 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
                        raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
                }
        }
-       update_rq_clock(rq1);
-       update_rq_clock(rq2);
 }
 
 /*
@@ -1813,7 +1823,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 }
 #endif
 
-static void calc_load_account_active(struct rq *this_rq);
+static void calc_load_account_idle(struct rq *this_rq);
 static void update_sysctl(void);
 static int get_update_sysctl_factor(void);
 
@@ -1870,62 +1880,43 @@ static void set_load_weight(struct task_struct *p)
        p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
 }
 
-static void update_avg(u64 *avg, u64 sample)
+static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 {
-       s64 diff = sample - *avg;
-       *avg += diff >> 3;
-}
-
-static void
-enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
-{
-       if (wakeup)
-               p->se.start_runtime = p->se.sum_exec_runtime;
-
+       update_rq_clock(rq);
        sched_info_queued(p);
-       p->sched_class->enqueue_task(rq, p, wakeup, head);
+       p->sched_class->enqueue_task(rq, p, flags);
        p->se.on_rq = 1;
 }
 
-static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
+static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 {
-       if (sleep) {
-               if (p->se.last_wakeup) {
-                       update_avg(&p->se.avg_overlap,
-                               p->se.sum_exec_runtime - p->se.last_wakeup);
-                       p->se.last_wakeup = 0;
-               } else {
-                       update_avg(&p->se.avg_wakeup,
-                               sysctl_sched_wakeup_granularity);
-               }
-       }
-
+       update_rq_clock(rq);
        sched_info_dequeued(p);
-       p->sched_class->dequeue_task(rq, p, sleep);
+       p->sched_class->dequeue_task(rq, p, flags);
        p->se.on_rq = 0;
 }
 
 /*
  * activate_task - move a task to the runqueue.
  */
-static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
+static void activate_task(struct rq *rq, struct task_struct *p, int flags)
 {
        if (task_contributes_to_load(p))
                rq->nr_uninterruptible--;
 
-       enqueue_task(rq, p, wakeup, false);
+       enqueue_task(rq, p, flags);
        inc_nr_running(rq);
 }
 
 /*
  * deactivate_task - remove a task from the runqueue.
  */
-static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
+static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
 {
        if (task_contributes_to_load(p))
                rq->nr_uninterruptible++;
 
-       dequeue_task(rq, p, sleep);
+       dequeue_task(rq, p, flags);
        dec_nr_running(rq);
 }
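The separate wakeup/sleep/head booleans are folded here into a single flags argument. The flag names below all appear elsewhere in this diff (ENQUEUE_WAKEUP, ENQUEUE_WAKING, ENQUEUE_HEAD, DEQUEUE_SLEEP); the bit values and the demo routine are assumptions for illustration only.

#include <stdio.h>

#define ENQUEUE_WAKEUP  0x1     /* task is being woken up */
#define ENQUEUE_WAKING  0x2     /* the class ->task_waking() hook ran */
#define ENQUEUE_HEAD    0x4     /* queue at the head of its list */
#define DEQUEUE_SLEEP   0x1     /* task is going to sleep */

static void demo_enqueue(const char *who, int flags)
{
        printf("%s: wakeup=%d waking=%d head=%d\n", who,
               !!(flags & ENQUEUE_WAKEUP),
               !!(flags & ENQUEUE_WAKING),
               !!(flags & ENQUEUE_HEAD));
}

int main(void)
{
        /* try_to_wake_up() path when the class has a ->task_waking() hook */
        demo_enqueue("ttwu", ENQUEUE_WAKEUP | ENQUEUE_WAKING);
        /* rt_mutex_setprio() re-queueing a boosted task at the head */
        demo_enqueue("pi-boost", ENQUEUE_HEAD);
        return 0;
}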
 
@@ -2054,21 +2045,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
        __set_task_cpu(p, new_cpu);
 }
 
-struct migration_req {
-       struct list_head list;
-
+struct migration_arg {
        struct task_struct *task;
        int dest_cpu;
-
-       struct completion done;
 };
 
+static int migration_cpu_stop(void *data);
+
 /*
  * The task's runqueue lock must be held.
  * Returns true if you have to wait for migration thread.
  */
-static int
-migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
+static bool migrate_task(struct task_struct *p, int dest_cpu)
 {
        struct rq *rq = task_rq(p);
 
@@ -2076,15 +2064,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
         * If the task is not on a runqueue (and not running), then
         * the next wake-up will properly place the task.
         */
-       if (!p->se.on_rq && !task_running(rq, p))
-               return 0;
-
-       init_completion(&req->done);
-       req->task = p;
-       req->dest_cpu = dest_cpu;
-       list_add(&req->list, &rq->migration_queue);
-
-       return 1;
+       return p->se.on_rq || task_running(rq, p);
 }
 
 /*
@@ -2142,7 +2122,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
                 * just go back and repeat.
                 */
                rq = task_rq_lock(p, &flags);
-               trace_sched_wait_task(rq, p);
+               trace_sched_wait_task(p);
                running = task_running(rq, p);
                on_rq = p->se.on_rq;
                ncsw = 0;
@@ -2240,6 +2220,9 @@ void task_oncpu_function_call(struct task_struct *p,
 }
 
 #ifdef CONFIG_SMP
+/*
+ * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
+ */
 static int select_fallback_rq(int cpu, struct task_struct *p)
 {
        int dest_cpu;
@@ -2256,12 +2239,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
                return dest_cpu;
 
        /* No more Mr. Nice Guy. */
-       if (dest_cpu >= nr_cpu_ids) {
-               rcu_read_lock();
-               cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
-               rcu_read_unlock();
-               dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
-
+       if (unlikely(dest_cpu >= nr_cpu_ids)) {
+               dest_cpu = cpuset_cpus_allowed_fallback(p);
                /*
                 * Don't tell them about moving exiting tasks or
                 * kernel threads (both mm NULL), since they never
@@ -2278,17 +2257,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 }
 
 /*
- * Gets called from 3 sites (exec, fork, wakeup), since it is called without
- * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done
- * by:
- *
- *  exec:           is unstable, retry loop
- *  fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
+ * The caller (fork, wakeup) owns TASK_WAKING, so ->cpus_allowed is stable.
  */
 static inline
-int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
+int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
 {
-       int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
+       int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
 
        /*
         * In order not to call set_task_cpu() on a blocking task we need
@@ -2306,6 +2280,12 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
 
        return cpu;
 }
+
+static void update_avg(u64 *avg, u64 sample)
+{
+       s64 diff = sample - *avg;
+       *avg += diff >> 3;
+}
 #endif
 
 /***
@@ -2327,16 +2307,13 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 {
        int cpu, orig_cpu, this_cpu, success = 0;
        unsigned long flags;
+       unsigned long en_flags = ENQUEUE_WAKEUP;
        struct rq *rq;
 
-       if (!sched_feat(SYNC_WAKEUPS))
-               wake_flags &= ~WF_SYNC;
-
        this_cpu = get_cpu();
 
        smp_wmb();
        rq = task_rq_lock(p, &flags);
-       update_rq_clock(rq);
        if (!(p->state & state))
                goto out;
 
@@ -2356,28 +2333,26 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
         *
         * First fix up the nr_uninterruptible count:
         */
-       if (task_contributes_to_load(p))
-               rq->nr_uninterruptible--;
+       if (task_contributes_to_load(p)) {
+               if (likely(cpu_online(orig_cpu)))
+                       rq->nr_uninterruptible--;
+               else
+                       this_rq()->nr_uninterruptible--;
+       }
        p->state = TASK_WAKING;
 
-       if (p->sched_class->task_waking)
+       if (p->sched_class->task_waking) {
                p->sched_class->task_waking(rq, p);
+               en_flags |= ENQUEUE_WAKING;
+       }
 
-       __task_rq_unlock(rq);
-
-       cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
-       if (cpu != orig_cpu) {
-               /*
-                * Since we migrate the task without holding any rq->lock,
-                * we need to be careful with task_rq_lock(), since that
-                * might end up locking an invalid rq.
-                */
+       cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
+       if (cpu != orig_cpu)
                set_task_cpu(p, cpu);
-       }
+       __task_rq_unlock(rq);
 
        rq = cpu_rq(cpu);
        raw_spin_lock(&rq->lock);
-       update_rq_clock(rq);
 
        /*
         * We migrated the task without holding either rq->lock, however
@@ -2405,36 +2380,20 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 
 out_activate:
 #endif /* CONFIG_SMP */
-       schedstat_inc(p, se.nr_wakeups);
+       schedstat_inc(p, se.statistics.nr_wakeups);
        if (wake_flags & WF_SYNC)
-               schedstat_inc(p, se.nr_wakeups_sync);
+               schedstat_inc(p, se.statistics.nr_wakeups_sync);
        if (orig_cpu != cpu)
-               schedstat_inc(p, se.nr_wakeups_migrate);
+               schedstat_inc(p, se.statistics.nr_wakeups_migrate);
        if (cpu == this_cpu)
-               schedstat_inc(p, se.nr_wakeups_local);
+               schedstat_inc(p, se.statistics.nr_wakeups_local);
        else
-               schedstat_inc(p, se.nr_wakeups_remote);
-       activate_task(rq, p, 1);
+               schedstat_inc(p, se.statistics.nr_wakeups_remote);
+       activate_task(rq, p, en_flags);
        success = 1;
 
-       /*
-        * Only attribute actual wakeups done by this task.
-        */
-       if (!in_interrupt()) {
-               struct sched_entity *se = &current->se;
-               u64 sample = se->sum_exec_runtime;
-
-               if (se->last_wakeup)
-                       sample -= se->last_wakeup;
-               else
-                       sample -= se->start_runtime;
-               update_avg(&se->avg_wakeup, sample);
-
-               se->last_wakeup = se->sum_exec_runtime;
-       }
-
 out_running:
-       trace_sched_wakeup(rq, p, success);
+       trace_sched_wakeup(p, success);
        check_preempt_curr(rq, p, wake_flags);
 
        p->state = TASK_RUNNING;
@@ -2494,42 +2453,9 @@ static void __sched_fork(struct task_struct *p)
        p->se.sum_exec_runtime          = 0;
        p->se.prev_sum_exec_runtime     = 0;
        p->se.nr_migrations             = 0;
-       p->se.last_wakeup               = 0;
-       p->se.avg_overlap               = 0;
-       p->se.start_runtime             = 0;
-       p->se.avg_wakeup                = sysctl_sched_wakeup_granularity;
 
 #ifdef CONFIG_SCHEDSTATS
-       p->se.wait_start                        = 0;
-       p->se.wait_max                          = 0;
-       p->se.wait_count                        = 0;
-       p->se.wait_sum                          = 0;
-
-       p->se.sleep_start                       = 0;
-       p->se.sleep_max                         = 0;
-       p->se.sum_sleep_runtime                 = 0;
-
-       p->se.block_start                       = 0;
-       p->se.block_max                         = 0;
-       p->se.exec_max                          = 0;
-       p->se.slice_max                         = 0;
-
-       p->se.nr_migrations_cold                = 0;
-       p->se.nr_failed_migrations_affine       = 0;
-       p->se.nr_failed_migrations_running      = 0;
-       p->se.nr_failed_migrations_hot          = 0;
-       p->se.nr_forced_migrations              = 0;
-
-       p->se.nr_wakeups                        = 0;
-       p->se.nr_wakeups_sync                   = 0;
-       p->se.nr_wakeups_migrate                = 0;
-       p->se.nr_wakeups_local                  = 0;
-       p->se.nr_wakeups_remote                 = 0;
-       p->se.nr_wakeups_affine                 = 0;
-       p->se.nr_wakeups_affine_attempts        = 0;
-       p->se.nr_wakeups_passive                = 0;
-       p->se.nr_wakeups_idle                   = 0;
-
+       memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
 
        INIT_LIST_HEAD(&p->rt.run_list);
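The long per-field reset removed above becomes a single memset() because the schedstat counters now live in one sub-structure of the sched entity. A minimal userspace model of that layout change (the field names are a small illustrative subset, not the full struct sched_statistics):

#include <stdio.h>
#include <string.h>

struct toy_statistics {                 /* stands in for struct sched_statistics */
        unsigned long long wait_max;
        unsigned long long sleep_max;
        unsigned long long nr_wakeups;
};

struct toy_sched_entity {
        unsigned long long sum_exec_runtime;    /* not a statistic: survives reset */
        struct toy_statistics statistics;
};

int main(void)
{
        struct toy_sched_entity se = {
                .sum_exec_runtime = 12345,
                .statistics = { .wait_max = 7, .sleep_max = 9, .nr_wakeups = 3 },
        };

        /* __sched_fork() / proc_sched_set_task() style reset */
        memset(&se.statistics, 0, sizeof(se.statistics));

        printf("runtime=%llu wait_max=%llu\n",
               se.sum_exec_runtime, se.statistics.wait_max);
        return 0;
}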
@@ -2550,11 +2476,11 @@ void sched_fork(struct task_struct *p, int clone_flags)
 
        __sched_fork(p);
        /*
-        * We mark the process as waking here. This guarantees that
+        * We mark the process as running here. This guarantees that
         * nobody will actually run it, and a signal or other external
         * event cannot wake it up and insert it on the runqueue either.
         */
-       p->state = TASK_WAKING;
+       p->state = TASK_RUNNING;
 
        /*
         * Revert to default priority/policy on fork if requested.
@@ -2621,31 +2547,27 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
        int cpu __maybe_unused = get_cpu();
 
 #ifdef CONFIG_SMP
+       rq = task_rq_lock(p, &flags);
+       p->state = TASK_WAKING;
+
        /*
         * Fork balancing, do it here and not earlier because:
         *  - cpus_allowed can change in the fork path
         *  - any previously selected cpu might disappear through hotplug
         *
-        * We still have TASK_WAKING but PF_STARTING is gone now, meaning
-        * ->cpus_allowed is stable, we have preemption disabled, meaning
-        * cpu_online_mask is stable.
+        * We set TASK_WAKING so that select_task_rq() can drop rq->lock
+        * without people poking at ->cpus_allowed.
         */
-       cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
+       cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
        set_task_cpu(p, cpu);
-#endif
 
-       /*
-        * Since the task is not on the rq and we still have TASK_WAKING set
-        * nobody else will migrate this task.
-        */
-       rq = cpu_rq(cpu);
-       raw_spin_lock_irqsave(&rq->lock, flags);
-
-       BUG_ON(p->state != TASK_WAKING);
        p->state = TASK_RUNNING;
-       update_rq_clock(rq);
+       task_rq_unlock(rq, &flags);
+#endif
+
+       rq = task_rq_lock(p, &flags);
        activate_task(rq, p, 0);
-       trace_sched_wakeup_new(rq, p, 1);
+       trace_sched_wakeup_new(p, 1);
        check_preempt_curr(rq, p, WF_FORK);
 #ifdef CONFIG_SMP
        if (p->sched_class->task_woken)
@@ -2865,7 +2787,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
        struct mm_struct *mm, *oldmm;
 
        prepare_task_switch(rq, prev, next);
-       trace_sched_switch(rq, prev, next);
+       trace_sched_switch(prev, next);
        mm = next->mm;
        oldmm = prev->active_mm;
        /*
@@ -2982,6 +2904,61 @@ static unsigned long calc_load_update;
 unsigned long avenrun[3];
 EXPORT_SYMBOL(avenrun);
 
+static long calc_load_fold_active(struct rq *this_rq)
+{
+       long nr_active, delta = 0;
+
+       nr_active = this_rq->nr_running;
+       nr_active += (long) this_rq->nr_uninterruptible;
+
+       if (nr_active != this_rq->calc_load_active) {
+               delta = nr_active - this_rq->calc_load_active;
+               this_rq->calc_load_active = nr_active;
+       }
+
+       return delta;
+}
+
+#ifdef CONFIG_NO_HZ
+/*
+ * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
+ *
+ * When making the ILB scale, we should try to pull this in as well.
+ */
+static atomic_long_t calc_load_tasks_idle;
+
+static void calc_load_account_idle(struct rq *this_rq)
+{
+       long delta;
+
+       delta = calc_load_fold_active(this_rq);
+       if (delta)
+               atomic_long_add(delta, &calc_load_tasks_idle);
+}
+
+static long calc_load_fold_idle(void)
+{
+       long delta = 0;
+
+       /*
+        * It's got a race; we don't care...
+        */
+       if (atomic_long_read(&calc_load_tasks_idle))
+               delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
+
+       return delta;
+}
+#else
+static void calc_load_account_idle(struct rq *this_rq)
+{
+}
+
+static inline long calc_load_fold_idle(void)
+{
+       return 0;
+}
+#endif
+
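To see how the idle fold fits together with calc_load_account_active() in the next hunk, here is a standalone model: a CPU going NO_HZ-idle parks its delta in calc_load_tasks_idle, and the next periodic update folds both the active and the parked idle deltas into the global count, so no samples are lost. Atomics, locking and nr_uninterruptible are deliberately elided.

#include <stdio.h>

static long calc_load_tasks;        /* global load contribution */
static long calc_load_tasks_idle;   /* deltas parked by CPUs going NO_HZ idle */

struct toy_rq { long nr_running; long calc_load_active; };

static long calc_load_fold_active(struct toy_rq *rq)
{
        long nr = rq->nr_running, delta = 0;

        if (nr != rq->calc_load_active) {
                delta = nr - rq->calc_load_active;
                rq->calc_load_active = nr;
        }
        return delta;
}

static void calc_load_account_idle(struct toy_rq *rq)
{
        calc_load_tasks_idle += calc_load_fold_active(rq);
}

static void calc_load_account_active(struct toy_rq *rq)
{
        long delta = calc_load_fold_active(rq);

        delta += calc_load_tasks_idle;          /* calc_load_fold_idle() */
        calc_load_tasks_idle = 0;
        calc_load_tasks += delta;
}

int main(void)
{
        struct toy_rq cpu0 = { .nr_running = 3, .calc_load_active = 3 };
        struct toy_rq cpu1 = { .nr_running = 0, .calc_load_active = 2 };

        calc_load_account_idle(&cpu1);          /* cpu1 goes tickless: parks -2 */
        calc_load_account_active(&cpu0);        /* next LOAD_FREQ tick on cpu0  */
        printf("calc_load_tasks delta = %ld\n", calc_load_tasks);   /* -2 */
        return 0;
}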
 /**
  * get_avenrun - get the load average array
  * @loads:     pointer to dest load array
@@ -3028,20 +3005,22 @@ void calc_global_load(void)
 }
 
 /*
- * Either called from update_cpu_load() or from a cpu going idle
+ * Called from update_cpu_load() to periodically update this CPU's
+ * active count.
  */
 static void calc_load_account_active(struct rq *this_rq)
 {
-       long nr_active, delta;
+       long delta;
 
-       nr_active = this_rq->nr_running;
-       nr_active += (long) this_rq->nr_uninterruptible;
+       if (time_before(jiffies, this_rq->calc_load_update))
+               return;
 
-       if (nr_active != this_rq->calc_load_active) {
-               delta = nr_active - this_rq->calc_load_active;
-               this_rq->calc_load_active = nr_active;
+       delta  = calc_load_fold_active(this_rq);
+       delta += calc_load_fold_idle();
+       if (delta)
                atomic_long_add(delta, &calc_load_tasks);
-       }
+
+       this_rq->calc_load_update += LOAD_FREQ;
 }
 
 /*
@@ -3073,10 +3052,7 @@ static void update_cpu_load(struct rq *this_rq)
                this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
        }
 
-       if (time_after_eq(jiffies, this_rq->calc_load_update)) {
-               this_rq->calc_load_update += LOAD_FREQ;
-               calc_load_account_active(this_rq);
-       }
+       calc_load_account_active(this_rq);
 }
 
 #ifdef CONFIG_SMP
@@ -3088,44 +3064,27 @@ static void update_cpu_load(struct rq *this_rq)
 void sched_exec(void)
 {
        struct task_struct *p = current;
-       struct migration_req req;
-       int dest_cpu, this_cpu;
        unsigned long flags;
        struct rq *rq;
-
-again:
-       this_cpu = get_cpu();
-       dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0);
-       if (dest_cpu == this_cpu) {
-               put_cpu();
-               return;
-       }
+       int dest_cpu;
 
        rq = task_rq_lock(p, &flags);
-       put_cpu();
+       dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
+       if (dest_cpu == smp_processor_id())
+               goto unlock;
 
        /*
         * select_task_rq() can race against ->cpus_allowed
         */
-       if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
-           || unlikely(!cpu_active(dest_cpu))) {
-               task_rq_unlock(rq, &flags);
-               goto again;
-       }
+       if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
+           likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
+               struct migration_arg arg = { p, dest_cpu };
 
-       /* force the process onto the specified CPU */
-       if (migrate_task(p, dest_cpu, &req)) {
-               /* Need to wait for migration thread (might exit: take ref). */
-               struct task_struct *mt = rq->migration_thread;
-
-               get_task_struct(mt);
                task_rq_unlock(rq, &flags);
-               wake_up_process(mt);
-               put_task_struct(mt);
-               wait_for_completion(&req.done);
-
+               stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
                return;
        }
+unlock:
        task_rq_unlock(rq, &flags);
 }
 
@@ -3597,23 +3556,9 @@ static inline void schedule_debug(struct task_struct *prev)
 
 static void put_prev_task(struct rq *rq, struct task_struct *prev)
 {
-       if (prev->state == TASK_RUNNING) {
-               u64 runtime = prev->se.sum_exec_runtime;
-
-               runtime -= prev->se.prev_sum_exec_runtime;
-               runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
-
-               /*
-                * In order to avoid avg_overlap growing stale when we are
-                * indeed overlapping and hence not getting put to sleep, grow
-                * the avg_overlap on preemption.
-                *
-                * We use the average preemption runtime because that
-                * correlates to the amount of cache footprint a task can
-                * build up.
-                */
-               update_avg(&prev->se.avg_overlap, runtime);
-       }
+       if (prev->se.on_rq)
+               update_rq_clock(rq);
+       rq->skip_clock_update = 0;
        prev->sched_class->put_prev_task(rq, prev);
 }
 
@@ -3676,14 +3621,13 @@ need_resched_nonpreemptible:
                hrtick_clear(rq);
 
        raw_spin_lock_irq(&rq->lock);
-       update_rq_clock(rq);
        clear_tsk_need_resched(prev);
 
        if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
                if (unlikely(signal_pending_state(prev->state, prev)))
                        prev->state = TASK_RUNNING;
                else
-                       deactivate_task(rq, prev, 1);
+                       deactivate_task(rq, prev, DEQUEUE_SLEEP);
                switch_count = &prev->nvcsw;
        }
 
@@ -4006,8 +3950,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
        if (!x->done) {
                DECLARE_WAITQUEUE(wait, current);
 
-               wait.flags |= WQ_FLAG_EXCLUSIVE;
-               __add_wait_queue_tail(&x->wait, &wait);
+               __add_wait_queue_tail_exclusive(&x->wait, &wait);
                do {
                        if (signal_pending_state(state, current)) {
                                timeout = -ERESTARTSYS;
@@ -4233,7 +4176,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
        BUG_ON(prio < 0 || prio > MAX_PRIO);
 
        rq = task_rq_lock(p, &flags);
-       update_rq_clock(rq);
 
        oldprio = p->prio;
        prev_class = p->sched_class;
@@ -4254,7 +4196,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
        if (running)
                p->sched_class->set_curr_task(rq);
        if (on_rq) {
-               enqueue_task(rq, p, 0, oldprio < prio);
+               enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
 
                check_class_changed(rq, p, prev_class, oldprio, running);
        }
@@ -4276,7 +4218,6 @@ void set_user_nice(struct task_struct *p, long nice)
         * the task might be in the middle of scheduling on another CPU.
         */
        rq = task_rq_lock(p, &flags);
-       update_rq_clock(rq);
        /*
         * The RT priorities are set via sched_setscheduler(), but we still
         * allow the 'normal' nice value to be set - but as expected
@@ -4298,7 +4239,7 @@ void set_user_nice(struct task_struct *p, long nice)
        delta = p->prio - old_prio;
 
        if (on_rq) {
-               enqueue_task(rq, p, 0, false);
+               enqueue_task(rq, p, 0);
                /*
                 * If the task increased its priority or is running and
                 * lowered its priority, then reschedule its CPU:
@@ -4559,7 +4500,6 @@ recheck:
                raw_spin_unlock_irqrestore(&p->pi_lock, flags);
                goto recheck;
        }
-       update_rq_clock(rq);
        on_rq = p->se.on_rq;
        running = task_current(rq, p);
        if (on_rq)
@@ -5296,17 +5236,15 @@ static inline void sched_init_granularity(void)
 /*
  * This is how migration works:
  *
- * 1) we queue a struct migration_req structure in the source CPU's
- *    runqueue and wake up that CPU's migration thread.
- * 2) we down() the locked semaphore => thread blocks.
- * 3) migration thread wakes up (implicitly it forces the migrated
- *    thread off the CPU)
- * 4) it gets the migration request and checks whether the migrated
- *    task is still in the wrong runqueue.
- * 5) if it's in the wrong runqueue then the migration thread removes
+ * 1) we invoke migration_cpu_stop() on the target CPU using
+ *    stop_one_cpu().
+ * 2) stopper starts to run (implicitly forcing the migrated thread
+ *    off the CPU)
+ * 3) it checks whether the migrated task is still in the wrong runqueue.
+ * 4) if it's in the wrong runqueue then the migration thread removes
  *    it and puts it into the right queue.
- * 6) migration thread up()s the semaphore.
- * 7) we wake up and the migration is done.
+ * 5) stopper completes and stop_one_cpu() returns and the migration
+ *    is done.
  */
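The call pattern described by the steps above, as used by set_cpus_allowed_ptr() and sched_exec() in this diff, can be modeled in userspace as follows. stop_one_cpu() is reduced to a plain synchronous call here; in the kernel it runs the callback on the target CPU's stopper thread. This is an illustration of the flow, not kernel code.

#include <stdio.h>

struct migration_arg {
        int task_id;
        int dest_cpu;
};

static int task_cpu = 0;        /* models task_cpu(p) */

static int migration_cpu_stop(void *data)
{
        struct migration_arg *arg = data;

        /* __migrate_task(): move the task if it is still misplaced */
        if (task_cpu != arg->dest_cpu)
                task_cpu = arg->dest_cpu;
        return 0;
}

static int stop_one_cpu(int cpu, int (*fn)(void *), void *arg)
{
        /* kernel: queue on cpu's stopper thread and wait; here: just call */
        (void)cpu;
        return fn(arg);
}

int main(void)
{
        struct migration_arg arg = { .task_id = 42, .dest_cpu = 3 };

        /* the caller (e.g. set_cpus_allowed_ptr()) has dropped rq->lock */
        stop_one_cpu(0, migration_cpu_stop, &arg);
        printf("task %d now on cpu %d\n", arg.task_id, task_cpu);
        return 0;
}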
 
 /*
@@ -5320,12 +5258,23 @@ static inline void sched_init_granularity(void)
  */
 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 {
-       struct migration_req req;
        unsigned long flags;
        struct rq *rq;
+       unsigned int dest_cpu;
        int ret = 0;
 
+       /*
+        * Serialize against TASK_WAKING so that ttwu() and wake_up_new_task()
+        * can drop the rq->lock and still rely on ->cpus_allowed.
+        */
+again:
+       while (task_is_waking(p))
+               cpu_relax();
        rq = task_rq_lock(p, &flags);
+       if (task_is_waking(p)) {
+               task_rq_unlock(rq, &flags);
+               goto again;
+       }
 
        if (!cpumask_intersects(new_mask, cpu_active_mask)) {
                ret = -EINVAL;
@@ -5349,15 +5298,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
        if (cpumask_test_cpu(task_cpu(p), new_mask))
                goto out;
 
-       if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) {
+       dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
+       if (migrate_task(p, dest_cpu)) {
+               struct migration_arg arg = { p, dest_cpu };
                /* Need help from migration thread: drop lock and wait. */
-               struct task_struct *mt = rq->migration_thread;
-
-               get_task_struct(mt);
                task_rq_unlock(rq, &flags);
-               wake_up_process(mt);
-               put_task_struct(mt);
-               wait_for_completion(&req.done);
+               stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
                tlb_migrate_finish(p->mm);
                return 0;
        }
@@ -5415,98 +5361,49 @@ fail:
        return ret;
 }
 
-#define RCU_MIGRATION_IDLE     0
-#define RCU_MIGRATION_NEED_QS  1
-#define RCU_MIGRATION_GOT_QS   2
-#define RCU_MIGRATION_MUST_SYNC        3
-
 /*
- * migration_thread - this is a highprio system thread that performs
- * thread migration by bumping thread off CPU then 'pushing' onto
- * another runqueue.
+ * migration_cpu_stop - this will be executed by a highprio stopper thread
+ * and performs thread migration by bumping thread off CPU then
+ * 'pushing' onto another runqueue.
  */
-static int migration_thread(void *data)
+static int migration_cpu_stop(void *data)
 {
-       int badcpu;
-       int cpu = (long)data;
-       struct rq *rq;
-
-       rq = cpu_rq(cpu);
-       BUG_ON(rq->migration_thread != current);
-
-       set_current_state(TASK_INTERRUPTIBLE);
-       while (!kthread_should_stop()) {
-               struct migration_req *req;
-               struct list_head *head;
-
-               raw_spin_lock_irq(&rq->lock);
-
-               if (cpu_is_offline(cpu)) {
-                       raw_spin_unlock_irq(&rq->lock);
-                       break;
-               }
-
-               if (rq->active_balance) {
-                       active_load_balance(rq, cpu);
-                       rq->active_balance = 0;
-               }
-
-               head = &rq->migration_queue;
-
-               if (list_empty(head)) {
-                       raw_spin_unlock_irq(&rq->lock);
-                       schedule();
-                       set_current_state(TASK_INTERRUPTIBLE);
-                       continue;
-               }
-               req = list_entry(head->next, struct migration_req, list);
-               list_del_init(head->next);
-
-               if (req->task != NULL) {
-                       raw_spin_unlock(&rq->lock);
-                       __migrate_task(req->task, cpu, req->dest_cpu);
-               } else if (likely(cpu == (badcpu = smp_processor_id()))) {
-                       req->dest_cpu = RCU_MIGRATION_GOT_QS;
-                       raw_spin_unlock(&rq->lock);
-               } else {
-                       req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
-                       raw_spin_unlock(&rq->lock);
-                       WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
-               }
-               local_irq_enable();
-
-               complete(&req->done);
-       }
-       __set_current_state(TASK_RUNNING);
-
-       return 0;
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
-{
-       int ret;
+       struct migration_arg *arg = data;
 
+       /*
+        * The original target cpu might have gone down and we might
+        * be on another cpu but it doesn't matter.
+        * be on another cpu, but it doesn't matter.
        local_irq_disable();
-       ret = __migrate_task(p, src_cpu, dest_cpu);
+       __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
        local_irq_enable();
-       return ret;
+       return 0;
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
 /*
  * Figure out where task on dead CPU should go, use force if necessary.
  */
-static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
+void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 {
-       int dest_cpu;
+       struct rq *rq = cpu_rq(dead_cpu);
+       int needs_cpu, uninitialized_var(dest_cpu);
+       unsigned long flags;
 
-again:
-       dest_cpu = select_fallback_rq(dead_cpu, p);
+       local_irq_save(flags);
 
-       /* It can have affinity changed while we were choosing. */
-       if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
-               goto again;
+       raw_spin_lock(&rq->lock);
+       needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
+       if (needs_cpu)
+               dest_cpu = select_fallback_rq(dead_cpu, p);
+       raw_spin_unlock(&rq->lock);
+       /*
+        * It can only fail if we race with set_cpus_allowed(),
+        * in which case the racer should migrate the task anyway.
+        */
+       if (needs_cpu)
+               __migrate_task(p, dead_cpu, dest_cpu);
+       local_irq_restore(flags);
 }
 
 /*
@@ -5570,7 +5467,6 @@ void sched_idle_next(void)
 
        __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
 
-       update_rq_clock(rq);
        activate_task(rq, p, 0);
 
        raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -5625,7 +5521,6 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
        for ( ; ; ) {
                if (!rq->nr_running)
                        break;
-               update_rq_clock(rq);
                next = pick_next_task(rq);
                if (!next)
                        break;
@@ -5848,35 +5743,20 @@ static void set_rq_offline(struct rq *rq)
 static int __cpuinit
 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
-       struct task_struct *p;
        int cpu = (long)hcpu;
        unsigned long flags;
-       struct rq *rq;
+       struct rq *rq = cpu_rq(cpu);
 
        switch (action) {
 
        case CPU_UP_PREPARE:
        case CPU_UP_PREPARE_FROZEN:
-               p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
-               if (IS_ERR(p))
-                       return NOTIFY_BAD;
-               kthread_bind(p, cpu);
-               /* Must be high prio: stop_machine expects to yield to it. */
-               rq = task_rq_lock(p, &flags);
-               __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
-               task_rq_unlock(rq, &flags);
-               get_task_struct(p);
-               cpu_rq(cpu)->migration_thread = p;
                rq->calc_load_update = calc_load_update;
                break;
 
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
-               /* Strictly unnecessary, as first user will wake it. */
-               wake_up_process(cpu_rq(cpu)->migration_thread);
-
                /* Update our root-domain */
-               rq = cpu_rq(cpu);
                raw_spin_lock_irqsave(&rq->lock, flags);
                if (rq->rd) {
                        BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -5887,61 +5767,24 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                break;
 
 #ifdef CONFIG_HOTPLUG_CPU
-       case CPU_UP_CANCELED:
-       case CPU_UP_CANCELED_FROZEN:
-               if (!cpu_rq(cpu)->migration_thread)
-                       break;
-               /* Unbind it from offline cpu so it can run. Fall thru. */
-               kthread_bind(cpu_rq(cpu)->migration_thread,
-                            cpumask_any(cpu_online_mask));
-               kthread_stop(cpu_rq(cpu)->migration_thread);
-               put_task_struct(cpu_rq(cpu)->migration_thread);
-               cpu_rq(cpu)->migration_thread = NULL;
-               break;
-
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
-               cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
                migrate_live_tasks(cpu);
-               rq = cpu_rq(cpu);
-               kthread_stop(rq->migration_thread);
-               put_task_struct(rq->migration_thread);
-               rq->migration_thread = NULL;
                /* Idle task back to normal (off runqueue, low prio) */
                raw_spin_lock_irq(&rq->lock);
-               update_rq_clock(rq);
                deactivate_task(rq, rq->idle, 0);
                __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
                rq->idle->sched_class = &idle_sched_class;
                migrate_dead_tasks(cpu);
                raw_spin_unlock_irq(&rq->lock);
-               cpuset_unlock();
                migrate_nr_uninterruptible(rq);
                BUG_ON(rq->nr_running != 0);
                calc_global_load_remove(rq);
-               /*
-                * No need to migrate the tasks: it was best-effort if
-                * they didn't take sched_hotcpu_mutex. Just wake up
-                * the requestors.
-                */
-               raw_spin_lock_irq(&rq->lock);
-               while (!list_empty(&rq->migration_queue)) {
-                       struct migration_req *req;
-
-                       req = list_entry(rq->migration_queue.next,
-                                        struct migration_req, list);
-                       list_del_init(&req->list);
-                       raw_spin_unlock_irq(&rq->lock);
-                       complete(&req->done);
-                       raw_spin_lock_irq(&rq->lock);
-               }
-               raw_spin_unlock_irq(&rq->lock);
                break;
 
        case CPU_DYING:
        case CPU_DYING_FROZEN:
                /* Update our root-domain */
-               rq = cpu_rq(cpu);
                raw_spin_lock_irqsave(&rq->lock, flags);
                if (rq->rd) {
                        BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -6272,6 +6115,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
        struct rq *rq = cpu_rq(cpu);
        struct sched_domain *tmp;
 
+       for (tmp = sd; tmp; tmp = tmp->parent)
+               tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
+
        /* Remove the sched domains which do not contribute to scheduling. */
        for (tmp = sd; tmp; ) {
                struct sched_domain *parent = tmp->parent;
@@ -7755,10 +7601,8 @@ void __init sched_init(void)
                rq->push_cpu = 0;
                rq->cpu = i;
                rq->online = 0;
-               rq->migration_thread = NULL;
                rq->idle_stamp = 0;
                rq->avg_idle = 2*sysctl_sched_migration_cost;
-               INIT_LIST_HEAD(&rq->migration_queue);
                rq_attach_root(rq, &def_root_domain);
 #endif
                init_rq_hrtick(rq);
@@ -7859,7 +7703,6 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
 {
        int on_rq;
 
-       update_rq_clock(rq);
        on_rq = p->se.on_rq;
        if (on_rq)
                deactivate_task(rq, p, 0);
@@ -7886,9 +7729,9 @@ void normalize_rt_tasks(void)
 
                p->se.exec_start                = 0;
 #ifdef CONFIG_SCHEDSTATS
-               p->se.wait_start                = 0;
-               p->se.sleep_start               = 0;
-               p->se.block_start               = 0;
+               p->se.statistics.wait_start     = 0;
+               p->se.statistics.sleep_start    = 0;
+               p->se.statistics.block_start    = 0;
 #endif
 
                if (!rt_task(p)) {
@@ -8221,8 +8064,6 @@ void sched_move_task(struct task_struct *tsk)
 
        rq = task_rq_lock(tsk, &flags);
 
-       update_rq_clock(rq);
-
        running = task_current(rq, tsk);
        on_rq = tsk->se.on_rq;
 
@@ -8241,7 +8082,7 @@ void sched_move_task(struct task_struct *tsk)
        if (unlikely(running))
                tsk->sched_class->set_curr_task(rq);
        if (on_rq)
-               enqueue_task(rq, tsk, 0, false);
+               enqueue_task(rq, tsk, 0);
 
        task_rq_unlock(rq, &flags);
 }
@@ -9055,43 +8896,32 @@ struct cgroup_subsys cpuacct_subsys = {
 
 #ifndef CONFIG_SMP
 
-int rcu_expedited_torture_stats(char *page)
-{
-       return 0;
-}
-EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
-
 void synchronize_sched_expedited(void)
 {
+       barrier();
 }
 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
 
 #else /* #ifndef CONFIG_SMP */
 
-static DEFINE_PER_CPU(struct migration_req, rcu_migration_req);
-static DEFINE_MUTEX(rcu_sched_expedited_mutex);
-
-#define RCU_EXPEDITED_STATE_POST -2
-#define RCU_EXPEDITED_STATE_IDLE -1
-
-static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
+static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
 
-int rcu_expedited_torture_stats(char *page)
+static int synchronize_sched_expedited_cpu_stop(void *data)
 {
-       int cnt = 0;
-       int cpu;
-
-       cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state);
-       for_each_online_cpu(cpu) {
-                cnt += sprintf(&page[cnt], " %d:%d",
-                               cpu, per_cpu(rcu_migration_req, cpu).dest_cpu);
-       }
-       cnt += sprintf(&page[cnt], "\n");
-       return cnt;
+       /*
+        * There must be a full memory barrier on each affected CPU
+        * between the time that try_stop_cpus() is called and the
+        * time that it returns.
+        *
+        * In the current initial implementation of cpu_stop, the
+        * above condition is already met when control reaches
+        * this point, so the following smp_mb() is not strictly
+        * necessary.  Do smp_mb() anyway for documentation and
+        * robustness against future implementation changes.
+        */
+       smp_mb(); /* See above comment block. */
+       return 0;
 }
-EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
-
-static long synchronize_sched_expedited_count;
 
 /*
  * Wait for an rcu-sched grace period to elapse, but use "big hammer"
@@ -9105,18 +8935,14 @@ static long synchronize_sched_expedited_count;
  */
 void synchronize_sched_expedited(void)
 {
-       int cpu;
-       unsigned long flags;
-       bool need_full_sync = 0;
-       struct rq *rq;
-       struct migration_req *req;
-       long snap;
-       int trycount = 0;
+       int snap, trycount = 0;
 
        smp_mb();  /* ensure prior mod happens before capturing snap. */
-       snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1;
+       snap = atomic_read(&synchronize_sched_expedited_count) + 1;
        get_online_cpus();
-       while (!mutex_trylock(&rcu_sched_expedited_mutex)) {
+       while (try_stop_cpus(cpu_online_mask,
+                            synchronize_sched_expedited_cpu_stop,
+                            NULL) == -EAGAIN) {
                put_online_cpus();
                if (trycount++ < 10)
                        udelay(trycount * num_online_cpus());
@@ -9124,41 +8950,15 @@ void synchronize_sched_expedited(void)
                        synchronize_sched();
                        return;
                }
-               if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) {
+               if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
                        smp_mb(); /* ensure test happens before caller kfree */
                        return;
                }
                get_online_cpus();
        }
-       rcu_expedited_state = RCU_EXPEDITED_STATE_POST;
-       for_each_online_cpu(cpu) {
-               rq = cpu_rq(cpu);
-               req = &per_cpu(rcu_migration_req, cpu);
-               init_completion(&req->done);
-               req->task = NULL;
-               req->dest_cpu = RCU_MIGRATION_NEED_QS;
-               raw_spin_lock_irqsave(&rq->lock, flags);
-               list_add(&req->list, &rq->migration_queue);
-               raw_spin_unlock_irqrestore(&rq->lock, flags);
-               wake_up_process(rq->migration_thread);
-       }
-       for_each_online_cpu(cpu) {
-               rcu_expedited_state = cpu;
-               req = &per_cpu(rcu_migration_req, cpu);
-               rq = cpu_rq(cpu);
-               wait_for_completion(&req->done);
-               raw_spin_lock_irqsave(&rq->lock, flags);
-               if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
-                       need_full_sync = 1;
-               req->dest_cpu = RCU_MIGRATION_IDLE;
-               raw_spin_unlock_irqrestore(&rq->lock, flags);
-       }
-       rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
-       synchronize_sched_expedited_count++;
-       mutex_unlock(&rcu_sched_expedited_mutex);
+       atomic_inc(&synchronize_sched_expedited_count);
+       smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
        put_online_cpus();
-       if (need_full_sync)
-               synchronize_sched();
 }
 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
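The retry loop above bails out once synchronize_sched_expedited_count has advanced past the snapshot taken at entry. Since snap is the initial reading plus one, the counter must move by two before the shortcut fires, which is intended to ensure that a full expedited grace period both started and completed after our snapshot. A standalone model of just that counter arithmetic (all kernel synchronization elided):

#include <stdio.h>

static int expedited_count;     /* models synchronize_sched_expedited_count */

static int someone_else_finished(int snap)
{
        /* same test as above: count - snap > 0 */
        return expedited_count - snap > 0;
}

int main(void)
{
        int snap = expedited_count + 1;         /* as in the code above */

        expedited_count++;                      /* one concurrent completion */
        printf("after 1: shortcut=%d\n", someone_else_finished(snap));  /* 0 */

        expedited_count++;                      /* a second completion */
        printf("after 2: shortcut=%d\n", someone_else_finished(snap));  /* 1 */
        return 0;
}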
 
index 9b49db1440372bd6227e7e4258a0a6d78903e230..9cf1baf6616af14959b30f00357bbaef09b4a817 100644 (file)
@@ -70,16 +70,16 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu,
        PN(se->vruntime);
        PN(se->sum_exec_runtime);
 #ifdef CONFIG_SCHEDSTATS
-       PN(se->wait_start);
-       PN(se->sleep_start);
-       PN(se->block_start);
-       PN(se->sleep_max);
-       PN(se->block_max);
-       PN(se->exec_max);
-       PN(se->slice_max);
-       PN(se->wait_max);
-       PN(se->wait_sum);
-       P(se->wait_count);
+       PN(se->statistics.wait_start);
+       PN(se->statistics.sleep_start);
+       PN(se->statistics.block_start);
+       PN(se->statistics.sleep_max);
+       PN(se->statistics.block_max);
+       PN(se->statistics.exec_max);
+       PN(se->statistics.slice_max);
+       PN(se->statistics.wait_max);
+       PN(se->statistics.wait_sum);
+       P(se->statistics.wait_count);
 #endif
        P(se->load.weight);
 #undef PN
@@ -104,7 +104,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
        SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
                SPLIT_NS(p->se.vruntime),
                SPLIT_NS(p->se.sum_exec_runtime),
-               SPLIT_NS(p->se.sum_sleep_runtime));
+               SPLIT_NS(p->se.statistics.sum_sleep_runtime));
 #else
        SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
                0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
@@ -173,11 +173,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
        task_group_path(tg, path, sizeof(path));
 
        SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
-#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
-       {
-               uid_t uid = cfs_rq->tg->uid;
-               SEQ_printf(m, "\ncfs_rq[%d] for UID: %u\n", cpu, uid);
-       }
 #else
        SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
 #endif
@@ -407,40 +402,38 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
        PN(se.exec_start);
        PN(se.vruntime);
        PN(se.sum_exec_runtime);
-       PN(se.avg_overlap);
-       PN(se.avg_wakeup);
 
        nr_switches = p->nvcsw + p->nivcsw;
 
 #ifdef CONFIG_SCHEDSTATS
-       PN(se.wait_start);
-       PN(se.sleep_start);
-       PN(se.block_start);
-       PN(se.sleep_max);
-       PN(se.block_max);
-       PN(se.exec_max);
-       PN(se.slice_max);
-       PN(se.wait_max);
-       PN(se.wait_sum);
-       P(se.wait_count);
-       PN(se.iowait_sum);
-       P(se.iowait_count);
+       PN(se.statistics.wait_start);
+       PN(se.statistics.sleep_start);
+       PN(se.statistics.block_start);
+       PN(se.statistics.sleep_max);
+       PN(se.statistics.block_max);
+       PN(se.statistics.exec_max);
+       PN(se.statistics.slice_max);
+       PN(se.statistics.wait_max);
+       PN(se.statistics.wait_sum);
+       P(se.statistics.wait_count);
+       PN(se.statistics.iowait_sum);
+       P(se.statistics.iowait_count);
        P(sched_info.bkl_count);
        P(se.nr_migrations);
-       P(se.nr_migrations_cold);
-       P(se.nr_failed_migrations_affine);
-       P(se.nr_failed_migrations_running);
-       P(se.nr_failed_migrations_hot);
-       P(se.nr_forced_migrations);
-       P(se.nr_wakeups);
-       P(se.nr_wakeups_sync);
-       P(se.nr_wakeups_migrate);
-       P(se.nr_wakeups_local);
-       P(se.nr_wakeups_remote);
-       P(se.nr_wakeups_affine);
-       P(se.nr_wakeups_affine_attempts);
-       P(se.nr_wakeups_passive);
-       P(se.nr_wakeups_idle);
+       P(se.statistics.nr_migrations_cold);
+       P(se.statistics.nr_failed_migrations_affine);
+       P(se.statistics.nr_failed_migrations_running);
+       P(se.statistics.nr_failed_migrations_hot);
+       P(se.statistics.nr_forced_migrations);
+       P(se.statistics.nr_wakeups);
+       P(se.statistics.nr_wakeups_sync);
+       P(se.statistics.nr_wakeups_migrate);
+       P(se.statistics.nr_wakeups_local);
+       P(se.statistics.nr_wakeups_remote);
+       P(se.statistics.nr_wakeups_affine);
+       P(se.statistics.nr_wakeups_affine_attempts);
+       P(se.statistics.nr_wakeups_passive);
+       P(se.statistics.nr_wakeups_idle);
 
        {
                u64 avg_atom, avg_per_cpu;
@@ -491,31 +484,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 void proc_sched_set_task(struct task_struct *p)
 {
 #ifdef CONFIG_SCHEDSTATS
-       p->se.wait_max                          = 0;
-       p->se.wait_sum                          = 0;
-       p->se.wait_count                        = 0;
-       p->se.iowait_sum                        = 0;
-       p->se.iowait_count                      = 0;
-       p->se.sleep_max                         = 0;
-       p->se.sum_sleep_runtime                 = 0;
-       p->se.block_max                         = 0;
-       p->se.exec_max                          = 0;
-       p->se.slice_max                         = 0;
-       p->se.nr_migrations                     = 0;
-       p->se.nr_migrations_cold                = 0;
-       p->se.nr_failed_migrations_affine       = 0;
-       p->se.nr_failed_migrations_running      = 0;
-       p->se.nr_failed_migrations_hot          = 0;
-       p->se.nr_forced_migrations              = 0;
-       p->se.nr_wakeups                        = 0;
-       p->se.nr_wakeups_sync                   = 0;
-       p->se.nr_wakeups_migrate                = 0;
-       p->se.nr_wakeups_local                  = 0;
-       p->se.nr_wakeups_remote                 = 0;
-       p->se.nr_wakeups_affine                 = 0;
-       p->se.nr_wakeups_affine_attempts        = 0;
-       p->se.nr_wakeups_passive                = 0;
-       p->se.nr_wakeups_idle                   = 0;
-       p->sched_info.bkl_count                 = 0;
+       memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
 }
index 5a5ea2cd924fa8494abfa21f8203f919f40ff1ca..217e4a9393e42c2f5dfdfae058625601c15e235b 100644 (file)
@@ -35,8 +35,8 @@
  * (to see the precise effective timeslice length of your workload,
  *  run vmstat and monitor the context-switches (cs) field)
  */
-unsigned int sysctl_sched_latency = 5000000ULL;
-unsigned int normalized_sysctl_sched_latency = 5000000ULL;
+unsigned int sysctl_sched_latency = 6000000ULL;
+unsigned int normalized_sysctl_sched_latency = 6000000ULL;
 
 /*
  * The initial- and re-scaling of tunables is configurable
@@ -52,15 +52,15 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
 
 /*
  * Minimal preemption granularity for CPU-bound tasks:
- * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
-unsigned int sysctl_sched_min_granularity = 1000000ULL;
-unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL;
+unsigned int sysctl_sched_min_granularity = 2000000ULL;
+unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL;
 
 /*
  * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
  */
-static unsigned int sched_nr_latency = 5;
+static unsigned int sched_nr_latency = 3;
 
 /*
  * After fork, child runs first. If set to 0 (default) then
@@ -505,7 +505,8 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 {
        unsigned long delta_exec_weighted;
 
-       schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
+       schedstat_set(curr->statistics.exec_max,
+                     max((u64)delta_exec, curr->statistics.exec_max));
 
        curr->sum_exec_runtime += delta_exec;
        schedstat_add(cfs_rq, exec_clock, delta_exec);
@@ -548,7 +549,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
 static inline void
 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-       schedstat_set(se->wait_start, rq_of(cfs_rq)->clock);
+       schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
 }
 
 /*
@@ -567,18 +568,18 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 static void
 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-       schedstat_set(se->wait_max, max(se->wait_max,
-                       rq_of(cfs_rq)->clock - se->wait_start));
-       schedstat_set(se->wait_count, se->wait_count + 1);
-       schedstat_set(se->wait_sum, se->wait_sum +
-                       rq_of(cfs_rq)->clock - se->wait_start);
+       schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
+                       rq_of(cfs_rq)->clock - se->statistics.wait_start));
+       schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
+       schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
+                       rq_of(cfs_rq)->clock - se->statistics.wait_start);
 #ifdef CONFIG_SCHEDSTATS
        if (entity_is_task(se)) {
                trace_sched_stat_wait(task_of(se),
-                       rq_of(cfs_rq)->clock - se->wait_start);
+                       rq_of(cfs_rq)->clock - se->statistics.wait_start);
        }
 #endif
-       schedstat_set(se->wait_start, 0);
+       schedstat_set(se->statistics.wait_start, 0);
 }
 
 static inline void
@@ -657,39 +658,39 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
        if (entity_is_task(se))
                tsk = task_of(se);
 
-       if (se->sleep_start) {
-               u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
+       if (se->statistics.sleep_start) {
+               u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
 
                if ((s64)delta < 0)
                        delta = 0;
 
-               if (unlikely(delta > se->sleep_max))
-                       se->sleep_max = delta;
+               if (unlikely(delta > se->statistics.sleep_max))
+                       se->statistics.sleep_max = delta;
 
-               se->sleep_start = 0;
-               se->sum_sleep_runtime += delta;
+               se->statistics.sleep_start = 0;
+               se->statistics.sum_sleep_runtime += delta;
 
                if (tsk) {
                        account_scheduler_latency(tsk, delta >> 10, 1);
                        trace_sched_stat_sleep(tsk, delta);
                }
        }
-       if (se->block_start) {
-               u64 delta = rq_of(cfs_rq)->clock - se->block_start;
+       if (se->statistics.block_start) {
+               u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
 
                if ((s64)delta < 0)
                        delta = 0;
 
-               if (unlikely(delta > se->block_max))
-                       se->block_max = delta;
+               if (unlikely(delta > se->statistics.block_max))
+                       se->statistics.block_max = delta;
 
-               se->block_start = 0;
-               se->sum_sleep_runtime += delta;
+               se->statistics.block_start = 0;
+               se->statistics.sum_sleep_runtime += delta;
 
                if (tsk) {
                        if (tsk->in_iowait) {
-                               se->iowait_sum += delta;
-                               se->iowait_count++;
+                               se->statistics.iowait_sum += delta;
+                               se->statistics.iowait_count++;
                                trace_sched_stat_iowait(tsk, delta);
                        }
 
@@ -737,19 +738,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
                vruntime += sched_vslice(cfs_rq, se);
 
        /* sleeps up to a single latency don't count. */
-       if (!initial && sched_feat(FAIR_SLEEPERS)) {
+       if (!initial) {
                unsigned long thresh = sysctl_sched_latency;
 
-               /*
-                * Convert the sleeper threshold into virtual time.
-                * SCHED_IDLE is a special sub-class.  We care about
-                * fairness only relative to other SCHED_IDLE tasks,
-                * all of which have the same weight.
-                */
-               if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) ||
-                                task_of(se)->policy != SCHED_IDLE))
-                       thresh = calc_delta_fair(thresh, se);
-
                /*
                 * Halve their sleep time's effect, to allow
                 * for a gentler effect of sleepers:
@@ -766,9 +757,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
        se->vruntime = vruntime;
 }
 
-#define ENQUEUE_WAKEUP 1
-#define ENQUEUE_MIGRATE 2
-
 static void
 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
@@ -776,7 +764,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
         * Update the normalized vruntime before updating min_vruntime
         * through calling update_curr().
         */
-       if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE))
+       if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
                se->vruntime += cfs_rq->min_vruntime;
 
        /*
@@ -812,7 +800,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 static void
-dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
+dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
        /*
         * Update run-time statistics of the 'current'.
@@ -820,15 +808,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
        update_curr(cfs_rq);
 
        update_stats_dequeue(cfs_rq, se);
-       if (sleep) {
+       if (flags & DEQUEUE_SLEEP) {
 #ifdef CONFIG_SCHEDSTATS
                if (entity_is_task(se)) {
                        struct task_struct *tsk = task_of(se);
 
                        if (tsk->state & TASK_INTERRUPTIBLE)
-                               se->sleep_start = rq_of(cfs_rq)->clock;
+                               se->statistics.sleep_start = rq_of(cfs_rq)->clock;
                        if (tsk->state & TASK_UNINTERRUPTIBLE)
-                               se->block_start = rq_of(cfs_rq)->clock;
+                               se->statistics.block_start = rq_of(cfs_rq)->clock;
                }
 #endif
        }
@@ -845,7 +833,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
         * update can refer to the ->curr item and we need to reflect this
         * movement in our normalized position.
         */
-       if (!sleep)
+       if (!(flags & DEQUEUE_SLEEP))
                se->vruntime -= cfs_rq->min_vruntime;
 }
 
@@ -912,7 +900,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
         * when there are only lesser-weight tasks around):
         */
        if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
-               se->slice_max = max(se->slice_max,
+               se->statistics.slice_max = max(se->statistics.slice_max,
                        se->sum_exec_runtime - se->prev_sum_exec_runtime);
        }
 #endif
@@ -1054,16 +1042,10 @@ static inline void hrtick_update(struct rq *rq)
  * then put the task into the rbtree:
  */
 static void
-enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
+enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
        struct cfs_rq *cfs_rq;
        struct sched_entity *se = &p->se;
-       int flags = 0;
-
-       if (wakeup)
-               flags |= ENQUEUE_WAKEUP;
-       if (p->state == TASK_WAKING)
-               flags |= ENQUEUE_MIGRATE;
 
        for_each_sched_entity(se) {
                if (se->on_rq)
@@ -1081,18 +1063,18 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
  * decreased. We remove the task from the rbtree and
  * update the fair scheduling stats:
  */
-static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
+static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
        struct cfs_rq *cfs_rq;
        struct sched_entity *se = &p->se;
 
        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
-               dequeue_entity(cfs_rq, se, sleep);
+               dequeue_entity(cfs_rq, se, flags);
                /* Don't dequeue parent if it has other entities besides us */
                if (cfs_rq->load.weight)
                        break;
-               sleep = 1;
+               flags |= DEQUEUE_SLEEP;
        }
 
        hrtick_update(rq);
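
The boolean 'sleep'/'wakeup' parameters above give way to a flags word (ENQUEUE_WAKEUP, ENQUEUE_WAKING, DEQUEUE_SLEEP), letting callers OR in reasons without growing the argument list. A tiny standalone sketch of that calling convention; the flag value is illustrative, not the kernel's definition:

#include <stdio.h>

#define DEQUEUE_SLEEP   0x1     /* illustrative value only */

static void dequeue_entity(int weight, unsigned int flags)
{
        if (flags & DEQUEUE_SLEEP)
                printf("weight %d: dequeued because the task went to sleep\n", weight);
        else
                printf("weight %d: dequeued for another reason\n", weight);
}

int main(void)
{
        unsigned int flags = 0;

        /* Callers OR in reasons; new conditions need no signature change. */
        flags |= DEQUEUE_SLEEP;
        dequeue_entity(1024, flags);
        return 0;
}
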
@@ -1240,7 +1222,6 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
 
 static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 {
-       struct task_struct *curr = current;
        unsigned long this_load, load;
        int idx, this_cpu, prev_cpu;
        unsigned long tl_per_task;
@@ -1255,18 +1236,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
        load      = source_load(prev_cpu, idx);
        this_load = target_load(this_cpu, idx);
 
-       if (sync) {
-              if (sched_feat(SYNC_LESS) &&
-                  (curr->se.avg_overlap > sysctl_sched_migration_cost ||
-                   p->se.avg_overlap > sysctl_sched_migration_cost))
-                      sync = 0;
-       } else {
-               if (sched_feat(SYNC_MORE) &&
-                   (curr->se.avg_overlap < sysctl_sched_migration_cost &&
-                    p->se.avg_overlap < sysctl_sched_migration_cost))
-                       sync = 1;
-       }
-
        /*
         * If sync wakeup then subtract the (maximum possible)
         * effect of the currently running task from the load
@@ -1306,7 +1275,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
        if (sync && balanced)
                return 1;
 
-       schedstat_inc(p, se.nr_wakeups_affine_attempts);
+       schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
        tl_per_task = cpu_avg_load_per_task(this_cpu);
 
        if (balanced ||
@@ -1318,7 +1287,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
                 * there is no bad imbalance.
                 */
                schedstat_inc(sd, ttwu_move_affine);
-               schedstat_inc(p, se.nr_wakeups_affine);
+               schedstat_inc(p, se.statistics.nr_wakeups_affine);
 
                return 1;
        }
@@ -1406,29 +1375,48 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 /*
  * Try and locate an idle CPU in the sched_domain.
  */
-static int
-select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
+static int select_idle_sibling(struct task_struct *p, int target)
 {
        int cpu = smp_processor_id();
        int prev_cpu = task_cpu(p);
+       struct sched_domain *sd;
        int i;
 
        /*
-        * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE
-        * test in select_task_rq_fair) and the prev_cpu is idle then that's
-        * always a better target than the current cpu.
+        * If the task is going to be woken-up on this cpu and if it is
+        * already idle, then it is the right target.
         */
-       if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running)
+       if (target == cpu && idle_cpu(cpu))
+               return cpu;
+
+       /*
+        * If the task is going to be woken-up on the cpu where it previously
+        * ran and if it is currently idle, then it is the right target.
+        */
+       if (target == prev_cpu && idle_cpu(prev_cpu))
                return prev_cpu;
 
        /*
-        * Otherwise, iterate the domain and find an elegible idle cpu.
+        * Otherwise, iterate the domains and find an eligible idle cpu.
         */
-       for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
-               if (!cpu_rq(i)->cfs.nr_running) {
-                       target = i;
+       for_each_domain(target, sd) {
+               if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
                        break;
+
+               for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
+                       if (idle_cpu(i)) {
+                               target = i;
+                               break;
+                       }
                }
+
+               /*
+                * Let's stop looking for an idle sibling once we have reached
+                * the domain that spans the current cpu and prev_cpu.
+                */
+               if (cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
+                   cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
+                       break;
        }
 
        return target;
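
The rewritten select_idle_sibling() prefers the wake-up cpu, then the previous cpu, and only then scans cache-sharing domains, giving up at the level that already spans both candidates. A compact userspace model of that search order over a hypothetical two-level topology (the domain layout and idle map are invented for illustration):

#include <stdbool.h>
#include <stdio.h>

struct domain {
        bool shares_cache;      /* plays the role of SD_SHARE_PKG_RESOURCES */
        const int *cpus;
        int nr;
};

static bool spans(const struct domain *d, int cpu)
{
        for (int i = 0; i < d->nr; i++)
                if (d->cpus[i] == cpu)
                        return true;
        return false;
}

static int pick_idle_sibling(const struct domain *doms, int nr_doms,
                             int target, int prev, const bool *idle)
{
        if (idle[target])
                return target;
        if (idle[prev])
                return prev;

        for (int d = 0; d < nr_doms; d++) {
                if (!doms[d].shares_cache)
                        break;
                for (int i = 0; i < doms[d].nr; i++)
                        if (idle[doms[d].cpus[i]])
                                return doms[d].cpus[i];
                /* Stop once this level spans both the target and prev cpu. */
                if (spans(&doms[d], target) && spans(&doms[d], prev))
                        break;
        }
        return target;
}

int main(void)
{
        const int smt[] = { 0, 1 }, pkg[] = { 0, 1, 2, 3 };
        const struct domain doms[] = { { true, smt, 2 }, { true, pkg, 4 } };
        const bool idle[4] = { false, false, false, true };

        /* target cpu 0 and prev cpu 2 are busy; cpu 3 in the package is idle. */
        printf("chosen cpu: %d\n", pick_idle_sibling(doms, 2, 0, 2, idle));
        return 0;
}
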
@@ -1445,7 +1433,8 @@ select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
  *
  * preempt must be disabled.
  */
-static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
+static int
+select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags)
 {
        struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
        int cpu = smp_processor_id();
@@ -1456,8 +1445,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
        int sync = wake_flags & WF_SYNC;
 
        if (sd_flag & SD_BALANCE_WAKE) {
-               if (sched_feat(AFFINE_WAKEUPS) &&
-                   cpumask_test_cpu(cpu, &p->cpus_allowed))
+               if (cpumask_test_cpu(cpu, &p->cpus_allowed))
                        want_affine = 1;
                new_cpu = prev_cpu;
        }
@@ -1491,34 +1479,13 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
                }
 
                /*
-                * While iterating the domains looking for a spanning
-                * WAKE_AFFINE domain, adjust the affine target to any idle cpu
-                * in cache sharing domains along the way.
+                * If both cpu and prev_cpu are part of this domain,
+                * cpu is a valid SD_WAKE_AFFINE target.
                 */
-               if (want_affine) {
-                       int target = -1;
-
-                       /*
-                        * If both cpu and prev_cpu are part of this domain,
-                        * cpu is a valid SD_WAKE_AFFINE target.
-                        */
-                       if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
-                               target = cpu;
-
-                       /*
-                        * If there's an idle sibling in this domain, make that
-                        * the wake_affine target instead of the current cpu.
-                        */
-                       if (tmp->flags & SD_SHARE_PKG_RESOURCES)
-                               target = select_idle_sibling(p, tmp, target);
-
-                       if (target >= 0) {
-                               if (tmp->flags & SD_WAKE_AFFINE) {
-                                       affine_sd = tmp;
-                                       want_affine = 0;
-                               }
-                               cpu = target;
-                       }
+               if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
+                   cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+                       affine_sd = tmp;
+                       want_affine = 0;
                }
 
                if (!want_sd && !want_affine)
@@ -1531,22 +1498,29 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
                        sd = tmp;
        }
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
        if (sched_feat(LB_SHARES_UPDATE)) {
                /*
                 * Pick the largest domain to update shares over
                 */
                tmp = sd;
-               if (affine_sd && (!tmp ||
-                                 cpumask_weight(sched_domain_span(affine_sd)) >
-                                 cpumask_weight(sched_domain_span(sd))))
+               if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
                        tmp = affine_sd;
 
-               if (tmp)
+               if (tmp) {
+                       raw_spin_unlock(&rq->lock);
                        update_shares(tmp);
+                       raw_spin_lock(&rq->lock);
+               }
        }
+#endif
 
-       if (affine_sd && wake_affine(affine_sd, p, sync))
-               return cpu;
+       if (affine_sd) {
+               if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
+                       return select_idle_sibling(p, cpu);
+               else
+                       return select_idle_sibling(p, prev_cpu);
+       }
 
        while (sd) {
                int load_idx = sd->forkexec_idx;
@@ -1576,10 +1550,10 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 
                /* Now try balancing at a lower domain level of new_cpu */
                cpu = new_cpu;
-               weight = cpumask_weight(sched_domain_span(sd));
+               weight = sd->span_weight;
                sd = NULL;
                for_each_domain(cpu, tmp) {
-                       if (weight <= cpumask_weight(sched_domain_span(tmp)))
+                       if (weight <= tmp->span_weight)
                                break;
                        if (tmp->flags & sd_flag)
                                sd = tmp;
@@ -1591,63 +1565,26 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 }
 #endif /* CONFIG_SMP */
 
-/*
- * Adaptive granularity
- *
- * se->avg_wakeup gives the average time a task runs until it does a wakeup,
- * with the limit of wakeup_gran -- when it never does a wakeup.
- *
- * So the smaller avg_wakeup is the faster we want this task to preempt,
- * but we don't want to treat the preemptee unfairly and therefore allow it
- * to run for at least the amount of time we'd like to run.
- *
- * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one
- *
- * NOTE: we use *nr_running to scale with load, this nicely matches the
- *       degrading latency on load.
- */
-static unsigned long
-adaptive_gran(struct sched_entity *curr, struct sched_entity *se)
-{
-       u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
-       u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running;
-       u64 gran = 0;
-
-       if (this_run < expected_wakeup)
-               gran = expected_wakeup - this_run;
-
-       return min_t(s64, gran, sysctl_sched_wakeup_granularity);
-}
-
 static unsigned long
 wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
 {
        unsigned long gran = sysctl_sched_wakeup_granularity;
 
-       if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN))
-               gran = adaptive_gran(curr, se);
-
        /*
         * Since it's curr running now, convert the gran from real-time
         * to virtual-time in its units.
+        *
+        * By using 'se' instead of 'curr' we penalize light tasks, so
+        * they get preempted easier. That is, if 'se' < 'curr' then
+        * the resulting gran will be larger, therefore penalizing the
+        * lighter, if otoh 'se' > 'curr' then the resulting gran will
+        * be smaller, again penalizing the lighter task.
+        *
+        * This is especially important for buddies when the leftmost
+        * task is higher priority than the buddy.
         */
-       if (sched_feat(ASYM_GRAN)) {
-               /*
-                * By using 'se' instead of 'curr' we penalize light tasks, so
-                * they get preempted easier. That is, if 'se' < 'curr' then
-                * the resulting gran will be larger, therefore penalizing the
-                * lighter, if otoh 'se' > 'curr' then the resulting gran will
-                * be smaller, again penalizing the lighter task.
-                *
-                * This is especially important for buddies when the leftmost
-                * task is higher priority than the buddy.
-                */
-               if (unlikely(se->load.weight != NICE_0_LOAD))
-                       gran = calc_delta_fair(gran, se);
-       } else {
-               if (unlikely(curr->load.weight != NICE_0_LOAD))
-                       gran = calc_delta_fair(gran, curr);
-       }
+       if (unlikely(se->load.weight != NICE_0_LOAD))
+               gran = calc_delta_fair(gran, se);
 
        return gran;
 }
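
calc_delta_fair() scales a real-time span by NICE_0_LOAD/weight, so the asymmetry described in the comment falls out of simple arithmetic: a lighter 'se' produces a larger virtual granularity and a heavier one a smaller. A standalone sketch of that scaling; the NICE_0_LOAD value and the example weights are illustrative:

#include <stdio.h>

#define NICE_0_LOAD 1024UL      /* illustrative nice-0 weight */

/* Simplified model of calc_delta_fair(): convert a real-time span
 * into an entity's virtual time using its load weight. */
static unsigned long to_virtual(unsigned long gran_ns, unsigned long weight)
{
        return gran_ns * NICE_0_LOAD / weight;
}

int main(void)
{
        unsigned long gran = 1000000UL; /* 1 ms wakeup granularity */

        /* A lighter weight (512) yields a larger virtual granularity,
         * a heavier one (2048) a smaller, as the comment above describes. */
        printf("light: %lu ns, heavy: %lu ns\n",
               to_virtual(gran, 512), to_virtual(gran, 2048));
        return 0;
}
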
@@ -1705,7 +1642,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
        struct task_struct *curr = rq->curr;
        struct sched_entity *se = &curr->se, *pse = &p->se;
        struct cfs_rq *cfs_rq = task_cfs_rq(curr);
-       int sync = wake_flags & WF_SYNC;
        int scale = cfs_rq->nr_running >= sched_nr_latency;
 
        if (unlikely(rt_prio(p->prio)))
@@ -1738,14 +1674,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
        if (unlikely(curr->policy == SCHED_IDLE))
                goto preempt;
 
-       if (sched_feat(WAKEUP_SYNC) && sync)
-               goto preempt;
-
-       if (sched_feat(WAKEUP_OVERLAP) &&
-                       se->avg_overlap < sysctl_sched_migration_cost &&
-                       pse->avg_overlap < sysctl_sched_migration_cost)
-               goto preempt;
-
        if (!sched_feat(WAKEUP_PREEMPT))
                return;
 
@@ -1844,13 +1772,13 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
         * 3) are cache-hot on their current CPU.
         */
        if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
-               schedstat_inc(p, se.nr_failed_migrations_affine);
+               schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
                return 0;
        }
        *all_pinned = 0;
 
        if (task_running(rq, p)) {
-               schedstat_inc(p, se.nr_failed_migrations_running);
+               schedstat_inc(p, se.statistics.nr_failed_migrations_running);
                return 0;
        }
 
@@ -1866,14 +1794,14 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
 #ifdef CONFIG_SCHEDSTATS
                if (tsk_cache_hot) {
                        schedstat_inc(sd, lb_hot_gained[idle]);
-                       schedstat_inc(p, se.nr_forced_migrations);
+                       schedstat_inc(p, se.statistics.nr_forced_migrations);
                }
 #endif
                return 1;
        }
 
        if (tsk_cache_hot) {
-               schedstat_inc(p, se.nr_failed_migrations_hot);
+               schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
                return 0;
        }
        return 1;
@@ -2311,7 +2239,7 @@ unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
 
 unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
 {
-       unsigned long weight = cpumask_weight(sched_domain_span(sd));
+       unsigned long weight = sd->span_weight;
        unsigned long smt_gain = sd->smt_gain;
 
        smt_gain /= weight;
@@ -2344,7 +2272,7 @@ unsigned long scale_rt_power(int cpu)
 
 static void update_cpu_power(struct sched_domain *sd, int cpu)
 {
-       unsigned long weight = cpumask_weight(sched_domain_span(sd));
+       unsigned long weight = sd->span_weight;
        unsigned long power = SCHED_LOAD_SCALE;
        struct sched_group *sdg = sd->groups;
 
@@ -2870,6 +2798,8 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
        return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
 }
 
+static int active_load_balance_cpu_stop(void *data);
+
 /*
  * Check this_cpu to ensure it is balanced within domain. Attempt to move
  * tasks if there is an imbalance.
@@ -2959,8 +2889,9 @@ redo:
                if (need_active_balance(sd, sd_idle, idle)) {
                        raw_spin_lock_irqsave(&busiest->lock, flags);
 
-                       /* don't kick the migration_thread, if the curr
-                        * task on busiest cpu can't be moved to this_cpu
+                       /* don't kick the active_load_balance_cpu_stop,
+                        * if the curr task on busiest cpu can't be
+                        * moved to this_cpu
                         */
                        if (!cpumask_test_cpu(this_cpu,
                                              &busiest->curr->cpus_allowed)) {
@@ -2970,14 +2901,22 @@ redo:
                                goto out_one_pinned;
                        }
 
+                       /*
+                        * ->active_balance synchronizes accesses to
+                        * ->active_balance_work.  Once set, it's cleared
+                        * only after active load balance is finished.
+                        */
                        if (!busiest->active_balance) {
                                busiest->active_balance = 1;
                                busiest->push_cpu = this_cpu;
                                active_balance = 1;
                        }
                        raw_spin_unlock_irqrestore(&busiest->lock, flags);
+
                        if (active_balance)
-                               wake_up_process(busiest->migration_thread);
+                               stop_one_cpu_nowait(cpu_of(busiest),
+                                       active_load_balance_cpu_stop, busiest,
+                                       &busiest->active_balance_work);
 
                        /*
                         * We've kicked active balancing, reset the failure
@@ -3084,24 +3023,29 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 }
 
 /*
- * active_load_balance is run by migration threads. It pushes running tasks
- * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
- * running on each physical CPU where possible, and avoids physical /
- * logical imbalances.
- *
- * Called with busiest_rq locked.
+ * active_load_balance_cpu_stop is run by cpu stopper. It pushes
+ * running tasks off the busiest CPU onto idle CPUs. It requires at
+ * least 1 task to be running on each physical CPU where possible, and
+ * avoids physical / logical imbalances.
  */
-static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
+static int active_load_balance_cpu_stop(void *data)
 {
+       struct rq *busiest_rq = data;
+       int busiest_cpu = cpu_of(busiest_rq);
        int target_cpu = busiest_rq->push_cpu;
+       struct rq *target_rq = cpu_rq(target_cpu);
        struct sched_domain *sd;
-       struct rq *target_rq;
+
+       raw_spin_lock_irq(&busiest_rq->lock);
+
+       /* make sure the requested cpu hasn't gone down in the meantime */
+       if (unlikely(busiest_cpu != smp_processor_id() ||
+                    !busiest_rq->active_balance))
+               goto out_unlock;
 
        /* Is there any task to move? */
        if (busiest_rq->nr_running <= 1)
-               return;
-
-       target_rq = cpu_rq(target_cpu);
+               goto out_unlock;
 
        /*
         * This condition is "impossible", if it occurs
@@ -3112,8 +3056,6 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 
        /* move a task from busiest_rq to target_rq */
        double_lock_balance(busiest_rq, target_rq);
-       update_rq_clock(busiest_rq);
-       update_rq_clock(target_rq);
 
        /* Search for an sd spanning us and the target CPU. */
        for_each_domain(target_cpu, sd) {
@@ -3132,6 +3074,10 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
                        schedstat_inc(sd, alb_failed);
        }
        double_unlock_balance(busiest_rq, target_rq);
+out_unlock:
+       busiest_rq->active_balance = 0;
+       raw_spin_unlock_irq(&busiest_rq->lock);
+       return 0;
 }
 
 #ifdef CONFIG_NO_HZ
index d5059fd761d9bf49aca78e830a4fe9330bd464c9..83c66e8ad3ee314704456e14dfc23607d00c5f0d 100644 (file)
@@ -1,10 +1,3 @@
-/*
- * Disregards a certain amount of sleep time (sched_latency_ns) and
- * considers the task to be running during that period. This gives it
- * a service deficit on wakeup, allowing it to run sooner.
- */
-SCHED_FEAT(FAIR_SLEEPERS, 1)
-
 /*
  * Only give sleepers 50% of their service deficit. This allows
  * them to run sooner, but does not allow tons of sleepers to
@@ -12,13 +5,6 @@ SCHED_FEAT(FAIR_SLEEPERS, 1)
  */
 SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
 
-/*
- * By not normalizing the sleep time, heavy tasks get an effective
- * longer period, and lighter task an effective shorter period they
- * are considered running.
- */
-SCHED_FEAT(NORMALIZED_SLEEPER, 0)
-
 /*
  * Place new tasks ahead so that they do not starve already running
  * tasks
@@ -30,37 +16,6 @@ SCHED_FEAT(START_DEBIT, 1)
  */
 SCHED_FEAT(WAKEUP_PREEMPT, 1)
 
-/*
- * Compute wakeup_gran based on task behaviour, clipped to
- *  [0, sched_wakeup_gran_ns]
- */
-SCHED_FEAT(ADAPTIVE_GRAN, 1)
-
-/*
- * When converting the wakeup granularity to virtual time, do it such
- * that heavier tasks preempting a lighter task have an edge.
- */
-SCHED_FEAT(ASYM_GRAN, 1)
-
-/*
- * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS.
- */
-SCHED_FEAT(WAKEUP_SYNC, 0)
-
-/*
- * Wakeup preempt based on task behaviour. Tasks that do not overlap
- * don't get preempted.
- */
-SCHED_FEAT(WAKEUP_OVERLAP, 0)
-
-/*
- * Use the SYNC wakeup hint, pipes and the likes use this to indicate
- * the remote end is likely to consume the data we just wrote, and
- * therefore has cache benefit from being placed on the same cpu, see
- * also AFFINE_WAKEUPS.
- */
-SCHED_FEAT(SYNC_WAKEUPS, 1)
-
 /*
  * Based on load and program behaviour, see if it makes sense to place
  * a newly woken task on the same cpu as the task that woke it --
@@ -69,16 +24,6 @@ SCHED_FEAT(SYNC_WAKEUPS, 1)
  */
 SCHED_FEAT(AFFINE_WAKEUPS, 1)
 
-/*
- * Weaken SYNC hint based on overlap
- */
-SCHED_FEAT(SYNC_LESS, 1)
-
-/*
- * Add SYNC hint based on overlap
- */
-SCHED_FEAT(SYNC_MORE, 0)
-
 /*
  * Prefer to schedule the task we woke last (assuming it failed
  * wakeup-preemption), since it's likely going to consume data we
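
Each remaining SCHED_FEAT(name, default) entry is consumed twice via the usual X-macro trick: once to assign bit positions and once to fold the default column into the mask that sched_feat() tests. A self-contained sketch of that pattern; the macro and variable names are illustrative rather than the kernel's exact ones:

#include <stdio.h>

/* Stand-in for the feature file; each entry is a (name, default) pair. */
#define SKETCH_FEATURES \
        SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) \
        SCHED_FEAT(START_DEBIT, 1) \
        SCHED_FEAT(WAKEUP_PREEMPT, 1)

/* First expansion: one enum constant (bit position) per feature. */
#define SCHED_FEAT(name, enabled) __FEAT_##name,
enum { SKETCH_FEATURES __FEAT_NR };
#undef SCHED_FEAT

/* Second expansion: fold the default column into a bitmask. */
#define SCHED_FEAT(name, enabled) ((enabled) << __FEAT_##name) |
static const unsigned int features = SKETCH_FEATURES 0;
#undef SCHED_FEAT

#define sched_feat(name) (features & (1U << __FEAT_##name))

int main(void)
{
        printf("WAKEUP_PREEMPT enabled: %d\n", !!sched_feat(WAKEUP_PREEMPT));
        return 0;
}

The two-pass expansion is what lets a feature line be added or dropped in this file without touching any other code.
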
index a8a6d8a50947f11e9bb1f8a9e782effffe4468f7..9fa0f402c87c2aa2bf8be7f404c6cfc27b64a865 100644 (file)
@@ -6,7 +6,8 @@
  */
 
 #ifdef CONFIG_SMP
-static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
+static int
+select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
 {
        return task_cpu(p); /* IDLE tasks are never migrated */
 }
@@ -22,8 +23,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
 static struct task_struct *pick_next_task_idle(struct rq *rq)
 {
        schedstat_inc(rq, sched_goidle);
-       /* adjust the active tasks as we might go into a long sleep */
-       calc_load_account_active(rq);
+       calc_load_account_idle(rq);
        return rq->idle;
 }
 
@@ -32,7 +32,7 @@ static struct task_struct *pick_next_task_idle(struct rq *rq)
  * message if some code attempts to do it:
  */
 static void
-dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep)
+dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
 {
        raw_spin_unlock_irq(&rq->lock);
        printk(KERN_ERR "bad: scheduling from the idle thread!\n");
index b5b920ae2ea7fe83ca17d2c94d0a7b638574144c..8afb953e31c6c1ba9a263a78929a04a0d82ff2c7 100644 (file)
@@ -613,7 +613,7 @@ static void update_curr_rt(struct rq *rq)
        if (unlikely((s64)delta_exec < 0))
                delta_exec = 0;
 
-       schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
+       schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec));
 
        curr->se.sum_exec_runtime += delta_exec;
        account_group_exec_runtime(curr, delta_exec);
@@ -888,20 +888,20 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
  * Adding/removing a task to/from a priority array:
  */
 static void
-enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head)
+enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 {
        struct sched_rt_entity *rt_se = &p->rt;
 
-       if (wakeup)
+       if (flags & ENQUEUE_WAKEUP)
                rt_se->timeout = 0;
 
-       enqueue_rt_entity(rt_se, head);
+       enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
 
        if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
                enqueue_pushable_task(rq, p);
 }
 
-static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
+static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 {
        struct sched_rt_entity *rt_se = &p->rt;
 
@@ -948,10 +948,9 @@ static void yield_task_rt(struct rq *rq)
 #ifdef CONFIG_SMP
 static int find_lowest_rq(struct task_struct *task);
 
-static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
+static int
+select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
 {
-       struct rq *rq = task_rq(p);
-
        if (sd_flag != SD_BALANCE_WAKE)
                return smp_processor_id();
 
index 9bb9fb1bd79c8b07da0dba482244d90e61aa7b76..ef51d1fcf5e6ddc818b5c85993801508e5ae9684 100644 (file)
-/* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation.
- * GPL v2 and any later version.
+/*
+ * kernel/stop_machine.c
+ *
+ * Copyright (C) 2008, 2005    IBM Corporation.
+ * Copyright (C) 2008, 2005    Rusty Russell rusty@rustcorp.com.au
+ * Copyright (C) 2010          SUSE Linux Products GmbH
+ * Copyright (C) 2010          Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2 and any later version.
  */
+#include <linux/completion.h>
 #include <linux/cpu.h>
-#include <linux/err.h>
+#include <linux/init.h>
 #include <linux/kthread.h>
 #include <linux/module.h>
+#include <linux/percpu.h>
 #include <linux/sched.h>
 #include <linux/stop_machine.h>
-#include <linux/syscalls.h>
 #include <linux/interrupt.h>
+#include <linux/kallsyms.h>
 
 #include <asm/atomic.h>
-#include <asm/uaccess.h>
+
+/*
+ * Structure to determine completion condition and record errors.  May
+ * be shared by works on different cpus.
+ */
+struct cpu_stop_done {
+       atomic_t                nr_todo;        /* nr left to execute */
+       bool                    executed;       /* actually executed? */
+       int                     ret;            /* collected return value */
+       struct completion       completion;     /* fired if nr_todo reaches 0 */
+};
+
+/* the actual stopper, one per every possible cpu, enabled on online cpus */
+struct cpu_stopper {
+       spinlock_t              lock;
+       struct list_head        works;          /* list of pending works */
+       struct task_struct      *thread;        /* stopper thread */
+       bool                    enabled;        /* is this stopper enabled? */
+};
+
+static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
+
+static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
+{
+       memset(done, 0, sizeof(*done));
+       atomic_set(&done->nr_todo, nr_todo);
+       init_completion(&done->completion);
+}
+
+/* signal completion unless @done is NULL */
+static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
+{
+       if (done) {
+               if (executed)
+                       done->executed = true;
+               if (atomic_dec_and_test(&done->nr_todo))
+                       complete(&done->completion);
+       }
+}
+
+/* queue @work to @stopper.  if offline, @work is completed immediately */
+static void cpu_stop_queue_work(struct cpu_stopper *stopper,
+                               struct cpu_stop_work *work)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&stopper->lock, flags);
+
+       if (stopper->enabled) {
+               list_add_tail(&work->list, &stopper->works);
+               wake_up_process(stopper->thread);
+       } else
+               cpu_stop_signal_done(work->done, false);
+
+       spin_unlock_irqrestore(&stopper->lock, flags);
+}
+
+/**
+ * stop_one_cpu - stop a cpu
+ * @cpu: cpu to stop
+ * @fn: function to execute
+ * @arg: argument to @fn
+ *
+ * Execute @fn(@arg) on @cpu.  @fn is run in a process context with
+ * the highest priority preempting any task on the cpu and
+ * monopolizing it.  This function returns after the execution is
+ * complete.
+ *
+ * This function doesn't guarantee @cpu stays online till @fn
+ * completes.  If @cpu goes down in the middle, execution may happen
+ * partially or fully on different cpus.  @fn should either be ready
+ * for that or the caller should ensure that @cpu stays online until
+ * this function completes.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * -ENOENT if @fn(@arg) was not executed because @cpu was offline;
+ * otherwise, the return value of @fn.
+ */
+int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
+{
+       struct cpu_stop_done done;
+       struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
+
+       cpu_stop_init_done(&done, 1);
+       cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work);
+       wait_for_completion(&done.completion);
+       return done.executed ? done.ret : -ENOENT;
+}
+
+/**
+ * stop_one_cpu_nowait - stop a cpu but don't wait for completion
+ * @cpu: cpu to stop
+ * @fn: function to execute
+ * @arg: argument to @fn
+ *
+ * Similar to stop_one_cpu() but doesn't wait for completion.  The
+ * caller is responsible for ensuring @work_buf is currently unused
+ * and will remain untouched until stopper starts executing @fn.
+ *
+ * CONTEXT:
+ * Don't care.
+ */
+void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
+                       struct cpu_stop_work *work_buf)
+{
+       *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
+       cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf);
+}
+
+/* static data for stop_cpus */
+static DEFINE_MUTEX(stop_cpus_mutex);
+static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
+
+int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
+{
+       struct cpu_stop_work *work;
+       struct cpu_stop_done done;
+       unsigned int cpu;
+
+       /* initialize works and done */
+       for_each_cpu(cpu, cpumask) {
+               work = &per_cpu(stop_cpus_work, cpu);
+               work->fn = fn;
+               work->arg = arg;
+               work->done = &done;
+       }
+       cpu_stop_init_done(&done, cpumask_weight(cpumask));
+
+       /*
+        * Disable preemption while queueing to avoid getting
+        * preempted by a stopper which might wait for other stoppers
+        * to enter @fn which can lead to deadlock.
+        */
+       preempt_disable();
+       for_each_cpu(cpu, cpumask)
+               cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu),
+                                   &per_cpu(stop_cpus_work, cpu));
+       preempt_enable();
+
+       wait_for_completion(&done.completion);
+       return done.executed ? done.ret : -ENOENT;
+}
+
+/**
+ * stop_cpus - stop multiple cpus
+ * @cpumask: cpus to stop
+ * @fn: function to execute
+ * @arg: argument to @fn
+ *
+ * Execute @fn(@arg) on online cpus in @cpumask.  On each target cpu,
+ * @fn is run in a process context with the highest priority
+ * preempting any task on the cpu and monopolizing it.  This function
+ * returns after all executions are complete.
+ *
+ * This function doesn't guarantee the cpus in @cpumask stay online
+ * till @fn completes.  If some cpus go down in the middle, execution
+ * on the cpu may happen partially or fully on different cpus.  @fn
+ * should either be ready for that or the caller should ensure that
+ * the cpus stay online until this function completes.
+ *
+ * All stop_cpus() calls are serialized making it safe for @fn to wait
+ * for all cpus to start executing it.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * -ENOENT if @fn(@arg) was not executed at all because all cpus in
+ * @cpumask were offline; otherwise, 0 if all executions of @fn
+ * returned 0, any non zero return value if any returned non zero.
+ */
+int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
+{
+       int ret;
+
+       /* static works are used, process one request at a time */
+       mutex_lock(&stop_cpus_mutex);
+       ret = __stop_cpus(cpumask, fn, arg);
+       mutex_unlock(&stop_cpus_mutex);
+       return ret;
+}
+
+/**
+ * try_stop_cpus - try to stop multiple cpus
+ * @cpumask: cpus to stop
+ * @fn: function to execute
+ * @arg: argument to @fn
+ *
+ * Identical to stop_cpus() except that it fails with -EAGAIN if
+ * someone else is already using the facility.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * -EAGAIN if someone else is already stopping cpus, -ENOENT if
+ * @fn(@arg) was not executed at all because all cpus in @cpumask were
+ * offline; otherwise, 0 if all executions of @fn returned 0, any non
+ * zero return value if any returned non zero.
+ */
+int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
+{
+       int ret;
+
+       /* static works are used, process one request at a time */
+       if (!mutex_trylock(&stop_cpus_mutex))
+               return -EAGAIN;
+       ret = __stop_cpus(cpumask, fn, arg);
+       mutex_unlock(&stop_cpus_mutex);
+       return ret;
+}
+
+static int cpu_stopper_thread(void *data)
+{
+       struct cpu_stopper *stopper = data;
+       struct cpu_stop_work *work;
+       int ret;
+
+repeat:
+       set_current_state(TASK_INTERRUPTIBLE);  /* mb paired w/ kthread_stop */
+
+       if (kthread_should_stop()) {
+               __set_current_state(TASK_RUNNING);
+               return 0;
+       }
+
+       work = NULL;
+       spin_lock_irq(&stopper->lock);
+       if (!list_empty(&stopper->works)) {
+               work = list_first_entry(&stopper->works,
+                                       struct cpu_stop_work, list);
+               list_del_init(&work->list);
+       }
+       spin_unlock_irq(&stopper->lock);
+
+       if (work) {
+               cpu_stop_fn_t fn = work->fn;
+               void *arg = work->arg;
+               struct cpu_stop_done *done = work->done;
+               char ksym_buf[KSYM_NAME_LEN];
+
+               __set_current_state(TASK_RUNNING);
+
+               /* cpu stop callbacks are not allowed to sleep */
+               preempt_disable();
+
+               ret = fn(arg);
+               if (ret)
+                       done->ret = ret;
+
+               /* restore preemption and check it's still balanced */
+               preempt_enable();
+               WARN_ONCE(preempt_count(),
+                         "cpu_stop: %s(%p) leaked preempt count\n",
+                         kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL,
+                                         ksym_buf), arg);
+
+               cpu_stop_signal_done(done, true);
+       } else
+               schedule();
+
+       goto repeat;
+}
+
+/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
+static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
+                                          unsigned long action, void *hcpu)
+{
+       struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+       unsigned int cpu = (unsigned long)hcpu;
+       struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+       struct cpu_stop_work *work;
+       struct task_struct *p;
+
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_UP_PREPARE:
+               BUG_ON(stopper->thread || stopper->enabled ||
+                      !list_empty(&stopper->works));
+               p = kthread_create(cpu_stopper_thread, stopper, "migration/%d",
+                                  cpu);
+               if (IS_ERR(p))
+                       return NOTIFY_BAD;
+               sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
+               get_task_struct(p);
+               stopper->thread = p;
+               break;
+
+       case CPU_ONLINE:
+               kthread_bind(stopper->thread, cpu);
+               /* strictly unnecessary, as first user will wake it */
+               wake_up_process(stopper->thread);
+               /* mark enabled */
+               spin_lock_irq(&stopper->lock);
+               stopper->enabled = true;
+               spin_unlock_irq(&stopper->lock);
+               break;
+
+#ifdef CONFIG_HOTPLUG_CPU
+       case CPU_UP_CANCELED:
+       case CPU_DEAD:
+               /* kill the stopper */
+               kthread_stop(stopper->thread);
+               /* drain remaining works */
+               spin_lock_irq(&stopper->lock);
+               list_for_each_entry(work, &stopper->works, list)
+                       cpu_stop_signal_done(work->done, false);
+               stopper->enabled = false;
+               spin_unlock_irq(&stopper->lock);
+               /* release the stopper */
+               put_task_struct(stopper->thread);
+               stopper->thread = NULL;
+               break;
+#endif
+       }
+
+       return NOTIFY_OK;
+}
+
+/*
+ * Give it a higher priority so that cpu stopper is available to other
+ * cpu notifiers.  It currently shares the same priority as sched
+ * migration_notifier.
+ */
+static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = {
+       .notifier_call  = cpu_stop_cpu_callback,
+       .priority       = 10,
+};
+
+static int __init cpu_stop_init(void)
+{
+       void *bcpu = (void *)(long)smp_processor_id();
+       unsigned int cpu;
+       int err;
+
+       for_each_possible_cpu(cpu) {
+               struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+
+               spin_lock_init(&stopper->lock);
+               INIT_LIST_HEAD(&stopper->works);
+       }
+
+       /* start one for the boot cpu */
+       err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
+                                   bcpu);
+       BUG_ON(err == NOTIFY_BAD);
+       cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
+       register_cpu_notifier(&cpu_stop_cpu_notifier);
+
+       return 0;
+}
+early_initcall(cpu_stop_init);
+
+#ifdef CONFIG_STOP_MACHINE
 
 /* This controls the threads on each CPU. */
 enum stopmachine_state {
@@ -26,174 +390,94 @@ enum stopmachine_state {
        /* Exit */
        STOPMACHINE_EXIT,
 };
-static enum stopmachine_state state;
 
 struct stop_machine_data {
-       int (*fn)(void *);
-       void *data;
-       int fnret;
+       int                     (*fn)(void *);
+       void                    *data;
+       /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
+       unsigned int            num_threads;
+       const struct cpumask    *active_cpus;
+
+       enum stopmachine_state  state;
+       atomic_t                thread_ack;
 };
 
-/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
-static unsigned int num_threads;
-static atomic_t thread_ack;
-static DEFINE_MUTEX(lock);
-/* setup_lock protects refcount, stop_machine_wq and stop_machine_work. */
-static DEFINE_MUTEX(setup_lock);
-/* Users of stop_machine. */
-static int refcount;
-static struct workqueue_struct *stop_machine_wq;
-static struct stop_machine_data active, idle;
-static const struct cpumask *active_cpus;
-static void __percpu *stop_machine_work;
-
-static void set_state(enum stopmachine_state newstate)
+static void set_state(struct stop_machine_data *smdata,
+                     enum stopmachine_state newstate)
 {
        /* Reset ack counter. */
-       atomic_set(&thread_ack, num_threads);
+       atomic_set(&smdata->thread_ack, smdata->num_threads);
        smp_wmb();
-       state = newstate;
+       smdata->state = newstate;
 }
 
 /* Last one to ack a state moves to the next state. */
-static void ack_state(void)
+static void ack_state(struct stop_machine_data *smdata)
 {
-       if (atomic_dec_and_test(&thread_ack))
-               set_state(state + 1);
+       if (atomic_dec_and_test(&smdata->thread_ack))
+               set_state(smdata, smdata->state + 1);
 }
 
-/* This is the actual function which stops the CPU. It runs
- * in the context of a dedicated stopmachine workqueue. */
-static void stop_cpu(struct work_struct *unused)
+/* This is the cpu_stop function which stops the CPU. */
+static int stop_machine_cpu_stop(void *data)
 {
+       struct stop_machine_data *smdata = data;
        enum stopmachine_state curstate = STOPMACHINE_NONE;
-       struct stop_machine_data *smdata = &idle;
-       int cpu = smp_processor_id();
-       int err;
+       int cpu = smp_processor_id(), err = 0;
+       bool is_active;
+
+       if (!smdata->active_cpus)
+               is_active = cpu == cpumask_first(cpu_online_mask);
+       else
+               is_active = cpumask_test_cpu(cpu, smdata->active_cpus);
 
-       if (!active_cpus) {
-               if (cpu == cpumask_first(cpu_online_mask))
-                       smdata = &active;
-       } else {
-               if (cpumask_test_cpu(cpu, active_cpus))
-                       smdata = &active;
-       }
        /* Simple state machine */
        do {
                /* Chill out and ensure we re-read stopmachine_state. */
                cpu_relax();
-               if (state != curstate) {
-                       curstate = state;
+               if (smdata->state != curstate) {
+                       curstate = smdata->state;
                        switch (curstate) {
                        case STOPMACHINE_DISABLE_IRQ:
                                local_irq_disable();
                                hard_irq_disable();
                                break;
                        case STOPMACHINE_RUN:
-                               /* On multiple CPUs only a single error code
-                                * is needed to tell that something failed. */
-                               err = smdata->fn(smdata->data);
-                               if (err)
-                                       smdata->fnret = err;
+                               if (is_active)
+                                       err = smdata->fn(smdata->data);
                                break;
                        default:
                                break;
                        }
-                       ack_state();
+                       ack_state(smdata);
                }
        } while (curstate != STOPMACHINE_EXIT);
 
        local_irq_enable();
+       return err;
 }
 
-/* Callback for CPUs which aren't supposed to do anything. */
-static int chill(void *unused)
-{
-       return 0;
-}
-
-int stop_machine_create(void)
-{
-       mutex_lock(&setup_lock);
-       if (refcount)
-               goto done;
-       stop_machine_wq = create_rt_workqueue("kstop");
-       if (!stop_machine_wq)
-               goto err_out;
-       stop_machine_work = alloc_percpu(struct work_struct);
-       if (!stop_machine_work)
-               goto err_out;
-done:
-       refcount++;
-       mutex_unlock(&setup_lock);
-       return 0;
-
-err_out:
-       if (stop_machine_wq)
-               destroy_workqueue(stop_machine_wq);
-       mutex_unlock(&setup_lock);
-       return -ENOMEM;
-}
-EXPORT_SYMBOL_GPL(stop_machine_create);
-
-void stop_machine_destroy(void)
-{
-       mutex_lock(&setup_lock);
-       refcount--;
-       if (refcount)
-               goto done;
-       destroy_workqueue(stop_machine_wq);
-       free_percpu(stop_machine_work);
-done:
-       mutex_unlock(&setup_lock);
-}
-EXPORT_SYMBOL_GPL(stop_machine_destroy);
-
 int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
 {
-       struct work_struct *sm_work;
-       int i, ret;
-
-       /* Set up initial state. */
-       mutex_lock(&lock);
-       num_threads = num_online_cpus();
-       active_cpus = cpus;
-       active.fn = fn;
-       active.data = data;
-       active.fnret = 0;
-       idle.fn = chill;
-       idle.data = NULL;
-
-       set_state(STOPMACHINE_PREPARE);
-
-       /* Schedule the stop_cpu work on all cpus: hold this CPU so one
-        * doesn't hit this CPU until we're ready. */
-       get_cpu();
-       for_each_online_cpu(i) {
-               sm_work = per_cpu_ptr(stop_machine_work, i);
-               INIT_WORK(sm_work, stop_cpu);
-               queue_work_on(i, stop_machine_wq, sm_work);
-       }
-       /* This will release the thread on our CPU. */
-       put_cpu();
-       flush_workqueue(stop_machine_wq);
-       ret = active.fnret;
-       mutex_unlock(&lock);
-       return ret;
+       struct stop_machine_data smdata = { .fn = fn, .data = data,
+                                           .num_threads = num_online_cpus(),
+                                           .active_cpus = cpus };
+
+       /* Set the initial state and stop all online cpus. */
+       set_state(&smdata, STOPMACHINE_PREPARE);
+       return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata);
 }
 
 int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
 {
        int ret;
 
-       ret = stop_machine_create();
-       if (ret)
-               return ret;
        /* No CPUs can come up or down during this. */
        get_online_cpus();
        ret = __stop_machine(fn, data, cpus);
        put_online_cpus();
-       stop_machine_destroy();
        return ret;
 }
 EXPORT_SYMBOL_GPL(stop_machine);
+
+#endif /* CONFIG_STOP_MACHINE */
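
Taken together, the kerneldoc above defines a small API: stop_one_cpu() runs a non-sleeping callback on one cpu's stopper thread and waits for it, while stop_cpus() does the same across a cpumask, serialized against other users. A hypothetical caller is sketched below; the callback, messages and helper names are invented for illustration, and since this patch does not export the symbols to modules the sketch assumes built-in code:

#include <linux/cpumask.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/stop_machine.h>

/* Runs on the target cpu's stopper thread with preemption disabled,
 * so it must not sleep. */
static int dump_cpu_state(void *arg)
{
        pr_info("cpu%d: %s\n", smp_processor_id(), (char *)arg);
        return 0;
}

static int __init cpu_stop_demo_init(void)
{
        int ret;

        /* Run the callback on cpu 0 and wait for completion;
         * -ENOENT means the cpu was offline and nothing ran. */
        ret = stop_one_cpu(0, dump_cpu_state, "stop_one_cpu demo");
        if (ret && ret != -ENOENT)
                return ret;

        /* Run it on every online cpu, serialized against other users. */
        return stop_cpus(cpu_online_mask, dump_cpu_state, "stop_cpus demo");
}
late_initcall(cpu_stop_demo_init);
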
index f992762d7f51c9e187160f8ee78543a5933cf679..1d7b9bc1c0340e8deccbc5df1418fd71889d837d 100644 (file)
@@ -150,14 +150,32 @@ static void tick_nohz_update_jiffies(ktime_t now)
        touch_softlockup_watchdog();
 }
 
+/*
+ * Updates the per-cpu idle time statistics counters
+ */
+static void
+update_ts_time_stats(struct tick_sched *ts, ktime_t now, u64 *last_update_time)
+{
+       ktime_t delta;
+
+       if (ts->idle_active) {
+               delta = ktime_sub(now, ts->idle_entrytime);
+               ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+               if (nr_iowait_cpu() > 0)
+                       ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
+               ts->idle_entrytime = now;
+       }
+
+       if (last_update_time)
+               *last_update_time = ktime_to_us(now);
+
+}
+
 static void tick_nohz_stop_idle(int cpu, ktime_t now)
 {
        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
-       ktime_t delta;
 
-       delta = ktime_sub(now, ts->idle_entrytime);
-       ts->idle_lastupdate = now;
-       ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+       update_ts_time_stats(ts, now, NULL);
        ts->idle_active = 0;
 
        sched_clock_idle_wakeup_event(0);
@@ -165,20 +183,32 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now)
 
 static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
 {
-       ktime_t now, delta;
+       ktime_t now;
 
        now = ktime_get();
-       if (ts->idle_active) {
-               delta = ktime_sub(now, ts->idle_entrytime);
-               ts->idle_lastupdate = now;
-               ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
-       }
+
+       update_ts_time_stats(ts, now, NULL);
+
        ts->idle_entrytime = now;
        ts->idle_active = 1;
        sched_clock_idle_sleep_event();
        return now;
 }
 
+/**
+ * get_cpu_idle_time_us - get the total idle time of a cpu
+ * @cpu: CPU number to query
+ * @last_update_time: variable to store update time in
+ *
+ * Return the cumulative idle time (since boot) for a given
+ * CPU, in microseconds. The idle time returned includes
+ * the iowait time (unlike what "top" and co report).
+ *
+ * This time is measured via accounting rather than sampling,
+ * and is as accurate as ktime_get() is.
+ *
+ * This function returns -1 if NOHZ is not enabled.
+ */
 u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
 {
        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
@@ -186,15 +216,38 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
        if (!tick_nohz_enabled)
                return -1;
 
-       if (ts->idle_active)
-               *last_update_time = ktime_to_us(ts->idle_lastupdate);
-       else
-               *last_update_time = ktime_to_us(ktime_get());
+       update_ts_time_stats(ts, ktime_get(), last_update_time);
 
        return ktime_to_us(ts->idle_sleeptime);
 }
 EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
 
+/**
+ * get_cpu_iowait_time_us - get the total iowait time of a cpu
+ * @cpu: CPU number to query
+ * @last_update_time: variable to store update time in
+ *
+ * Return the cumulative iowait time (since boot) for a given
+ * CPU, in microseconds.
+ *
+ * This time is measured via accounting rather than sampling,
+ * and is as accurate as ktime_get() is.
+ *
+ * This function returns -1 if NOHZ is not enabled.
+ */
+u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
+{
+       struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+
+       if (!tick_nohz_enabled)
+               return -1;
+
+       update_ts_time_stats(ts, ktime_get(), last_update_time);
+
+       return ktime_to_us(ts->iowait_sleeptime);
+}
+EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
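
The two exported helpers above are the kind of interface a cpufreq governor can poll. A rough sketch of sampling them, assuming CONFIG_NO_HZ; the caller, its name and the 100 ms period are illustrative only:

    #include <linux/delay.h>
    #include <linux/tick.h>

    /* Sketch: measure how much of a short interval one CPU spent idle and,
     * within that, waiting on I/O.  Returns false when NOHZ accounting is
     * unavailable (both helpers return -1 in that case). */
    static bool sample_cpu_idle(int cpu, u64 *idle_us, u64 *iowait_us)
    {
            u64 wall, idle0, idle1, iow0, iow1;

            idle0 = get_cpu_idle_time_us(cpu, &wall);
            iow0  = get_cpu_iowait_time_us(cpu, NULL);
            if (idle0 == (u64)-1 || iow0 == (u64)-1)
                    return false;

            msleep(100);                    /* illustrative sampling period */

            idle1 = get_cpu_idle_time_us(cpu, &wall);
            iow1  = get_cpu_iowait_time_us(cpu, NULL);

            *idle_us   = idle1 - idle0;     /* includes the iowait portion */
            *iowait_us = iow1 - iow0;
            return true;
    }
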
+
 /**
  * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
  *
@@ -262,6 +315,9 @@ void tick_nohz_stop_sched_tick(int inidle)
                goto end;
        }
 
+       if (nohz_ratelimit(cpu))
+               goto end;
+
        ts->idle_calls++;
        /* Read jiffies and the time when jiffies were updated last */
        do {
index 1a4a7dd787779345eafb02bc5781973cf68a1d6a..ab8f5e33fa92c76db813d1419e6a339f3a7aca52 100644 (file)
@@ -176,6 +176,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
                P_ns(idle_waketime);
                P_ns(idle_exittime);
                P_ns(idle_sleeptime);
+               P_ns(iowait_sleeptime);
                P(last_jiffies);
                P(next_jiffies);
                P_ns(idle_expires);
index b3bc91a3f510d089d7ce69ad4b294655dce53ac7..36ea2b65dcdc65a281bef8e7fa1acf0a44434003 100644 (file)
@@ -675,28 +675,33 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
        }
 }
 
-static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq)
+static void blk_add_trace_rq_abort(void *ignore,
+                                  struct request_queue *q, struct request *rq)
 {
        blk_add_trace_rq(q, rq, BLK_TA_ABORT);
 }
 
-static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq)
+static void blk_add_trace_rq_insert(void *ignore,
+                                   struct request_queue *q, struct request *rq)
 {
        blk_add_trace_rq(q, rq, BLK_TA_INSERT);
 }
 
-static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq)
+static void blk_add_trace_rq_issue(void *ignore,
+                                  struct request_queue *q, struct request *rq)
 {
        blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
 }
 
-static void blk_add_trace_rq_requeue(struct request_queue *q,
+static void blk_add_trace_rq_requeue(void *ignore,
+                                    struct request_queue *q,
                                     struct request *rq)
 {
        blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
 }
 
-static void blk_add_trace_rq_complete(struct request_queue *q,
+static void blk_add_trace_rq_complete(void *ignore,
+                                     struct request_queue *q,
                                      struct request *rq)
 {
        blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
@@ -724,34 +729,40 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
                        !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
 }
 
-static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio)
+static void blk_add_trace_bio_bounce(void *ignore,
+                                    struct request_queue *q, struct bio *bio)
 {
        blk_add_trace_bio(q, bio, BLK_TA_BOUNCE);
 }
 
-static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio)
+static void blk_add_trace_bio_complete(void *ignore,
+                                      struct request_queue *q, struct bio *bio)
 {
        blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
 }
 
-static void blk_add_trace_bio_backmerge(struct request_queue *q,
+static void blk_add_trace_bio_backmerge(void *ignore,
+                                       struct request_queue *q,
                                        struct bio *bio)
 {
        blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
 }
 
-static void blk_add_trace_bio_frontmerge(struct request_queue *q,
+static void blk_add_trace_bio_frontmerge(void *ignore,
+                                        struct request_queue *q,
                                         struct bio *bio)
 {
        blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
 }
 
-static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio)
+static void blk_add_trace_bio_queue(void *ignore,
+                                   struct request_queue *q, struct bio *bio)
 {
        blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
 }
 
-static void blk_add_trace_getrq(struct request_queue *q,
+static void blk_add_trace_getrq(void *ignore,
+                               struct request_queue *q,
                                struct bio *bio, int rw)
 {
        if (bio)
@@ -765,7 +776,8 @@ static void blk_add_trace_getrq(struct request_queue *q,
 }
 
 
-static void blk_add_trace_sleeprq(struct request_queue *q,
+static void blk_add_trace_sleeprq(void *ignore,
+                                 struct request_queue *q,
                                  struct bio *bio, int rw)
 {
        if (bio)
@@ -779,7 +791,7 @@ static void blk_add_trace_sleeprq(struct request_queue *q,
        }
 }
 
-static void blk_add_trace_plug(struct request_queue *q)
+static void blk_add_trace_plug(void *ignore, struct request_queue *q)
 {
        struct blk_trace *bt = q->blk_trace;
 
@@ -787,7 +799,7 @@ static void blk_add_trace_plug(struct request_queue *q)
                __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
 }
 
-static void blk_add_trace_unplug_io(struct request_queue *q)
+static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q)
 {
        struct blk_trace *bt = q->blk_trace;
 
@@ -800,7 +812,7 @@ static void blk_add_trace_unplug_io(struct request_queue *q)
        }
 }
 
-static void blk_add_trace_unplug_timer(struct request_queue *q)
+static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q)
 {
        struct blk_trace *bt = q->blk_trace;
 
@@ -813,7 +825,8 @@ static void blk_add_trace_unplug_timer(struct request_queue *q)
        }
 }
 
-static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
+static void blk_add_trace_split(void *ignore,
+                               struct request_queue *q, struct bio *bio,
                                unsigned int pdu)
 {
        struct blk_trace *bt = q->blk_trace;
@@ -839,8 +852,9 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
  *     it spans a stripe (or similar). Add a trace for that action.
  *
  **/
-static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
-                                      dev_t dev, sector_t from)
+static void blk_add_trace_remap(void *ignore,
+                               struct request_queue *q, struct bio *bio,
+                               dev_t dev, sector_t from)
 {
        struct blk_trace *bt = q->blk_trace;
        struct blk_io_trace_remap r;
@@ -869,7 +883,8 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
  *     Add a trace for that action.
  *
  **/
-static void blk_add_trace_rq_remap(struct request_queue *q,
+static void blk_add_trace_rq_remap(void *ignore,
+                                  struct request_queue *q,
                                   struct request *rq, dev_t dev,
                                   sector_t from)
 {
@@ -921,64 +936,64 @@ static void blk_register_tracepoints(void)
 {
        int ret;
 
-       ret = register_trace_block_rq_abort(blk_add_trace_rq_abort);
+       ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
        WARN_ON(ret);
-       ret = register_trace_block_rq_insert(blk_add_trace_rq_insert);
+       ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
        WARN_ON(ret);
-       ret = register_trace_block_rq_issue(blk_add_trace_rq_issue);
+       ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
        WARN_ON(ret);
-       ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue);
+       ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
        WARN_ON(ret);
-       ret = register_trace_block_rq_complete(blk_add_trace_rq_complete);
+       ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
        WARN_ON(ret);
-       ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce);
+       ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
        WARN_ON(ret);
-       ret = register_trace_block_bio_complete(blk_add_trace_bio_complete);
+       ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
        WARN_ON(ret);
-       ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
+       ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
        WARN_ON(ret);
-       ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
+       ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
        WARN_ON(ret);
-       ret = register_trace_block_bio_queue(blk_add_trace_bio_queue);
+       ret = register_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
        WARN_ON(ret);
-       ret = register_trace_block_getrq(blk_add_trace_getrq);
+       ret = register_trace_block_getrq(blk_add_trace_getrq, NULL);
        WARN_ON(ret);
-       ret = register_trace_block_sleeprq(blk_add_trace_sleeprq);
+       ret = register_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
        WARN_ON(ret);
-       ret = register_trace_block_plug(blk_add_trace_plug);
+       ret = register_trace_block_plug(blk_add_trace_plug, NULL);
        WARN_ON(ret);
-       ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer);
+       ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
        WARN_ON(ret);
-       ret = register_trace_block_unplug_io(blk_add_trace_unplug_io);
+       ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
        WARN_ON(ret);
-       ret = register_trace_block_split(blk_add_trace_split);
+       ret = register_trace_block_split(blk_add_trace_split, NULL);
        WARN_ON(ret);
-       ret = register_trace_block_remap(blk_add_trace_remap);
+       ret = register_trace_block_remap(blk_add_trace_remap, NULL);
        WARN_ON(ret);
-       ret = register_trace_block_rq_remap(blk_add_trace_rq_remap);
+       ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
        WARN_ON(ret);
 }
 
 static void blk_unregister_tracepoints(void)
 {
-       unregister_trace_block_rq_remap(blk_add_trace_rq_remap);
-       unregister_trace_block_remap(blk_add_trace_remap);
-       unregister_trace_block_split(blk_add_trace_split);
-       unregister_trace_block_unplug_io(blk_add_trace_unplug_io);
-       unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer);
-       unregister_trace_block_plug(blk_add_trace_plug);
-       unregister_trace_block_sleeprq(blk_add_trace_sleeprq);
-       unregister_trace_block_getrq(blk_add_trace_getrq);
-       unregister_trace_block_bio_queue(blk_add_trace_bio_queue);
-       unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
-       unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
-       unregister_trace_block_bio_complete(blk_add_trace_bio_complete);
-       unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce);
-       unregister_trace_block_rq_complete(blk_add_trace_rq_complete);
-       unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue);
-       unregister_trace_block_rq_issue(blk_add_trace_rq_issue);
-       unregister_trace_block_rq_insert(blk_add_trace_rq_insert);
-       unregister_trace_block_rq_abort(blk_add_trace_rq_abort);
+       unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
+       unregister_trace_block_remap(blk_add_trace_remap, NULL);
+       unregister_trace_block_split(blk_add_trace_split, NULL);
+       unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
+       unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
+       unregister_trace_block_plug(blk_add_trace_plug, NULL);
+       unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
+       unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
+       unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
+       unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
+       unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
+       unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
+       unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
+       unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
+       unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
+       unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
+       unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
+       unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
 
        tracepoint_synchronize_unregister();
 }
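
All of the probe and register/unregister changes in this file follow the same pattern: a tracepoint probe now receives, as its first argument, the private data pointer supplied at registration time (blktrace passes NULL, hence the "ignore" parameters). In general form, with a hypothetical probe and per-probe state (my_stats and the function names are not from this patch):

    struct my_stats;        /* hypothetical per-probe state */

    static void my_switch_probe(void *data,
                                struct task_struct *prev,
                                struct task_struct *next)
    {
            struct my_stats *stats = data; /* handed back verbatim on every hit */
            /* ... account the context switch in stats ... */
    }

    static int my_probe_init(struct my_stats *stats)
    {
            return register_trace_sched_switch(my_switch_probe, stats);
    }

    static void my_probe_exit(struct my_stats *stats)
    {
            /* Unregistration must name the same (probe, data) pair. */
            unregister_trace_sched_switch(my_switch_probe, stats);
            tracepoint_synchronize_unregister();
    }
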
@@ -1321,7 +1336,7 @@ out:
 }
 
 static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
-                                              int flags)
+                                              int flags, struct trace_event *event)
 {
        return print_one_line(iter, false);
 }
@@ -1343,7 +1358,8 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
 }
 
 static enum print_line_t
-blk_trace_event_print_binary(struct trace_iterator *iter, int flags)
+blk_trace_event_print_binary(struct trace_iterator *iter, int flags,
+                            struct trace_event *event)
 {
        return blk_trace_synthesize_old_trace(iter) ?
                        TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
@@ -1381,12 +1397,16 @@ static struct tracer blk_tracer __read_mostly = {
        .set_flag       = blk_tracer_set_flag,
 };
 
-static struct trace_event trace_blk_event = {
-       .type           = TRACE_BLK,
+static struct trace_event_functions trace_blk_event_funcs = {
        .trace          = blk_trace_event_print,
        .binary         = blk_trace_event_print_binary,
 };
 
+static struct trace_event trace_blk_event = {
+       .type           = TRACE_BLK,
+       .funcs          = &trace_blk_event_funcs,
+};
+
 static int __init init_blk_tracer(void)
 {
        if (!register_ftrace_event(&trace_blk_event)) {
index 2404b59b3097e09f1db4c15a1a8cf5240dbe4c51..6d2cb14f9449083c9a2e78f507b9c1255c8e7ca2 100644 (file)
@@ -264,6 +264,7 @@ struct ftrace_profile {
        unsigned long                   counter;
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
        unsigned long long              time;
+       unsigned long long              time_squared;
 #endif
 };
 
@@ -366,9 +367,9 @@ static int function_stat_headers(struct seq_file *m)
 {
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
        seq_printf(m, "  Function                               "
-                  "Hit    Time            Avg\n"
+                  "Hit    Time            Avg             s^2\n"
                      "  --------                               "
-                  "---    ----            ---\n");
+                  "---    ----            ---             ---\n");
 #else
        seq_printf(m, "  Function                               Hit\n"
                      "  --------                               ---\n");
@@ -384,6 +385,7 @@ static int function_stat_show(struct seq_file *m, void *v)
        static DEFINE_MUTEX(mutex);
        static struct trace_seq s;
        unsigned long long avg;
+       unsigned long long stddev;
 #endif
 
        kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
@@ -394,11 +396,25 @@ static int function_stat_show(struct seq_file *m, void *v)
        avg = rec->time;
        do_div(avg, rec->counter);
 
+       /* Sample variance (s^2) */
+       if (rec->counter <= 1)
+               stddev = 0;
+       else {
+               stddev = rec->time_squared - rec->counter * avg * avg;
+               /*
+                * Divide by only 1000 for the ns^2 -> us^2 conversion;
+                * trace_print_graph_duration will divide by 1000 again.
+                */
+               do_div(stddev, (rec->counter - 1) * 1000);
+       }
+
        mutex_lock(&mutex);
        trace_seq_init(&s);
        trace_print_graph_duration(rec->time, &s);
        trace_seq_puts(&s, "    ");
        trace_print_graph_duration(avg, &s);
+       trace_seq_puts(&s, "    ");
+       trace_print_graph_duration(stddev, &s);
        trace_print_seq(m, &s);
        mutex_unlock(&mutex);
 #endif
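
The s^2 column computed above relies on the usual shortcut for the sum of squared deviations, s^2 = (sum(t_i^2) - n * avg^2) / (n - 1) with avg = sum(t_i) / n (plus the extra divide by 1000 noted in the comment), so the profiler only needs to accumulate a hit count, a running total of times, and a running total of squared times per function. That is why profile_graph_return() in the next hunk adds both calltime and calltime * calltime to the record.
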
@@ -650,6 +666,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
        if (!stat->hash || !ftrace_profile_enabled)
                goto out;
 
+       /* If the calltime was zero'd, ignore it */
+       if (!trace->calltime)
+               goto out;
+
        calltime = trace->rettime - trace->calltime;
 
        if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) {
@@ -668,8 +688,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
        }
 
        rec = ftrace_find_profiled_func(stat, trace->func);
-       if (rec)
+       if (rec) {
                rec->time += calltime;
+               rec->time_squared += calltime * calltime;
+       }
 
  out:
        local_irq_restore(flags);
@@ -3212,8 +3234,8 @@ free:
 }
 
 static void
-ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev,
-                               struct task_struct *next)
+ftrace_graph_probe_sched_switch(void *ignore,
+                       struct task_struct *prev, struct task_struct *next)
 {
        unsigned long long timestamp;
        int index;
@@ -3267,7 +3289,7 @@ static int start_graph_tracing(void)
        } while (ret == -EAGAIN);
 
        if (!ret) {
-               ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch);
+               ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
                if (ret)
                        pr_info("ftrace_graph: Couldn't activate tracepoint"
                                " probe to kernel_sched_switch\n");
@@ -3339,11 +3361,11 @@ void unregister_ftrace_graph(void)
                goto out;
 
        ftrace_graph_active--;
-       unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
        ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
        ftrace_graph_entry = ftrace_graph_entry_stub;
        ftrace_shutdown(FTRACE_STOP_FUNC_RET);
        unregister_pm_notifier(&ftrace_suspend_notifier);
+       unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
 
  out:
        mutex_unlock(&ftrace_lock);
index a91da69f153ad0c859997356d53db6548006db7c..bbfc1bb1660b248758c85e2073999d1b5cce258f 100644 (file)
@@ -95,7 +95,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
        trace_wake_up();
 }
 
-static void kmemtrace_kmalloc(unsigned long call_site,
+static void kmemtrace_kmalloc(void *ignore,
+                             unsigned long call_site,
                              const void *ptr,
                              size_t bytes_req,
                              size_t bytes_alloc,
@@ -105,7 +106,8 @@ static void kmemtrace_kmalloc(unsigned long call_site,
                        bytes_req, bytes_alloc, gfp_flags, -1);
 }
 
-static void kmemtrace_kmem_cache_alloc(unsigned long call_site,
+static void kmemtrace_kmem_cache_alloc(void *ignore,
+                                      unsigned long call_site,
                                       const void *ptr,
                                       size_t bytes_req,
                                       size_t bytes_alloc,
@@ -115,7 +117,8 @@ static void kmemtrace_kmem_cache_alloc(unsigned long call_site,
                        bytes_req, bytes_alloc, gfp_flags, -1);
 }
 
-static void kmemtrace_kmalloc_node(unsigned long call_site,
+static void kmemtrace_kmalloc_node(void *ignore,
+                                  unsigned long call_site,
                                   const void *ptr,
                                   size_t bytes_req,
                                   size_t bytes_alloc,
@@ -126,7 +129,8 @@ static void kmemtrace_kmalloc_node(unsigned long call_site,
                        bytes_req, bytes_alloc, gfp_flags, node);
 }
 
-static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site,
+static void kmemtrace_kmem_cache_alloc_node(void *ignore,
+                                           unsigned long call_site,
                                            const void *ptr,
                                            size_t bytes_req,
                                            size_t bytes_alloc,
@@ -137,12 +141,14 @@ static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site,
                        bytes_req, bytes_alloc, gfp_flags, node);
 }
 
-static void kmemtrace_kfree(unsigned long call_site, const void *ptr)
+static void
+kmemtrace_kfree(void *ignore, unsigned long call_site, const void *ptr)
 {
        kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr);
 }
 
-static void kmemtrace_kmem_cache_free(unsigned long call_site, const void *ptr)
+static void kmemtrace_kmem_cache_free(void *ignore,
+                                     unsigned long call_site, const void *ptr)
 {
        kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr);
 }
@@ -151,34 +157,34 @@ static int kmemtrace_start_probes(void)
 {
        int err;
 
-       err = register_trace_kmalloc(kmemtrace_kmalloc);
+       err = register_trace_kmalloc(kmemtrace_kmalloc, NULL);
        if (err)
                return err;
-       err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc);
+       err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
        if (err)
                return err;
-       err = register_trace_kmalloc_node(kmemtrace_kmalloc_node);
+       err = register_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
        if (err)
                return err;
-       err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node);
+       err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
        if (err)
                return err;
-       err = register_trace_kfree(kmemtrace_kfree);
+       err = register_trace_kfree(kmemtrace_kfree, NULL);
        if (err)
                return err;
-       err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free);
+       err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
 
        return err;
 }
 
 static void kmemtrace_stop_probes(void)
 {
-       unregister_trace_kmalloc(kmemtrace_kmalloc);
-       unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc);
-       unregister_trace_kmalloc_node(kmemtrace_kmalloc_node);
-       unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node);
-       unregister_trace_kfree(kmemtrace_kfree);
-       unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free);
+       unregister_trace_kmalloc(kmemtrace_kmalloc, NULL);
+       unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
+       unregister_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
+       unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
+       unregister_trace_kfree(kmemtrace_kfree, NULL);
+       unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
 }
 
 static int kmem_trace_init(struct trace_array *tr)
@@ -237,7 +243,8 @@ struct kmemtrace_user_event_alloc {
 };
 
 static enum print_line_t
-kmemtrace_print_alloc(struct trace_iterator *iter, int flags)
+kmemtrace_print_alloc(struct trace_iterator *iter, int flags,
+                     struct trace_event *event)
 {
        struct trace_seq *s = &iter->seq;
        struct kmemtrace_alloc_entry *entry;
@@ -257,7 +264,8 @@ kmemtrace_print_alloc(struct trace_iterator *iter, int flags)
 }
 
 static enum print_line_t
-kmemtrace_print_free(struct trace_iterator *iter, int flags)
+kmemtrace_print_free(struct trace_iterator *iter, int flags,
+                    struct trace_event *event)
 {
        struct trace_seq *s = &iter->seq;
        struct kmemtrace_free_entry *entry;
@@ -275,7 +283,8 @@ kmemtrace_print_free(struct trace_iterator *iter, int flags)
 }
 
 static enum print_line_t
-kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags)
+kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags,
+                          struct trace_event *event)
 {
        struct trace_seq *s = &iter->seq;
        struct kmemtrace_alloc_entry *entry;
@@ -309,7 +318,8 @@ kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags)
 }
 
 static enum print_line_t
-kmemtrace_print_free_user(struct trace_iterator *iter, int flags)
+kmemtrace_print_free_user(struct trace_iterator *iter, int flags,
+                         struct trace_event *event)
 {
        struct trace_seq *s = &iter->seq;
        struct kmemtrace_free_entry *entry;
@@ -463,18 +473,26 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
        }
 }
 
-static struct trace_event kmem_trace_alloc = {
-       .type                   = TRACE_KMEM_ALLOC,
+static struct trace_event_functions kmem_trace_alloc_funcs = {
        .trace                  = kmemtrace_print_alloc,
        .binary                 = kmemtrace_print_alloc_user,
 };
 
-static struct trace_event kmem_trace_free = {
-       .type                   = TRACE_KMEM_FREE,
+static struct trace_event kmem_trace_alloc = {
+       .type                   = TRACE_KMEM_ALLOC,
+       .funcs                  = &kmem_trace_alloc_funcs,
+};
+
+static struct trace_event_functions kmem_trace_free_funcs = {
        .trace                  = kmemtrace_print_free,
        .binary                 = kmemtrace_print_free_user,
 };
 
+static struct trace_event kmem_trace_free = {
+       .type                   = TRACE_KMEM_FREE,
+       .funcs                  = &kmem_trace_free_funcs,
+};
+
 static struct tracer kmem_tracer __read_mostly = {
        .name                   = "kmemtrace",
        .init                   = kmem_trace_init,
index 41ca394feb22f4e920cbbf711e201b8e1f5ee903..7f6059c5aa94c772ba6ad9785c40838f96bbc77f 100644 (file)
@@ -319,6 +319,11 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
 #define TS_MASK                ((1ULL << TS_SHIFT) - 1)
 #define TS_DELTA_TEST  (~TS_MASK)
 
+/* Flag when events were overwritten */
+#define RB_MISSED_EVENTS       (1 << 31)
+/* Missed count stored at end */
+#define RB_MISSED_STORED       (1 << 30)
+
 struct buffer_data_page {
        u64              time_stamp;    /* page time stamp */
        local_t          commit;        /* write committed index */
@@ -338,6 +343,7 @@ struct buffer_page {
        local_t          write;         /* index for next write */
        unsigned         read;          /* index for next read */
        local_t          entries;       /* entries on this page */
+       unsigned long    real_end;      /* real end of data */
        struct buffer_data_page *page;  /* Actual data page */
 };
 
@@ -417,6 +423,12 @@ int ring_buffer_print_page_header(struct trace_seq *s)
                               (unsigned int)sizeof(field.commit),
                               (unsigned int)is_signed_type(long));
 
+       ret = trace_seq_printf(s, "\tfield: int overwrite;\t"
+                              "offset:%u;\tsize:%u;\tsigned:%u;\n",
+                              (unsigned int)offsetof(typeof(field), commit),
+                              1,
+                              (unsigned int)is_signed_type(long));
+
        ret = trace_seq_printf(s, "\tfield: char data;\t"
                               "offset:%u;\tsize:%u;\tsigned:%u;\n",
                               (unsigned int)offsetof(typeof(field), data),
@@ -440,6 +452,8 @@ struct ring_buffer_per_cpu {
        struct buffer_page              *tail_page;     /* write to tail */
        struct buffer_page              *commit_page;   /* committed pages */
        struct buffer_page              *reader_page;
+       unsigned long                   lost_events;
+       unsigned long                   last_overrun;
        local_t                         commit_overrun;
        local_t                         overrun;
        local_t                         entries;
@@ -1761,6 +1775,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
        event = __rb_page_index(tail_page, tail);
        kmemcheck_annotate_bitfield(event, bitfield);
 
+       /*
+        * Save the original length in the page meta data.
+        * The reader will use it when appending the lost-event
+        * count after the data.
+        */
+       tail_page->real_end = tail;
+
        /*
         * If this event is bigger than the minimum size, then
         * we need to be careful that we don't subtract the
@@ -1979,17 +2000,13 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
                  u64 *ts, u64 *delta)
 {
        struct ring_buffer_event *event;
-       static int once;
        int ret;
 
-       if (unlikely(*delta > (1ULL << 59) && !once++)) {
-               printk(KERN_WARNING "Delta way too big! %llu"
-                      " ts=%llu write stamp = %llu\n",
-                      (unsigned long long)*delta,
-                      (unsigned long long)*ts,
-                      (unsigned long long)cpu_buffer->write_stamp);
-               WARN_ON(1);
-       }
+       WARN_ONCE(*delta > (1ULL << 59),
+                 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
+                 (unsigned long long)*delta,
+                 (unsigned long long)*ts,
+                 (unsigned long long)cpu_buffer->write_stamp);
 
        /*
         * The delta is too big, we need to add a
@@ -2838,6 +2855,7 @@ static struct buffer_page *
 rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 {
        struct buffer_page *reader = NULL;
+       unsigned long overwrite;
        unsigned long flags;
        int nr_loops = 0;
        int ret;
@@ -2879,6 +2897,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
        local_set(&cpu_buffer->reader_page->write, 0);
        local_set(&cpu_buffer->reader_page->entries, 0);
        local_set(&cpu_buffer->reader_page->page->commit, 0);
+       cpu_buffer->reader_page->real_end = 0;
 
  spin:
        /*
@@ -2898,6 +2917,18 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
        /* The reader page will be pointing to the new head */
        rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
 
+       /*
+        * We want to make sure we read the overruns after we set up our
+        * pointers to the next object. The writer side does a
+        * cmpxchg to cross pages which acts as the mb on the writer
+        * side. Note, the reader will constantly fail the swap
+        * while the writer is updating the pointers, so this
+        * guarantees that the overwrite recorded here is the one we
+        * want to compare with the last_overrun.
+        */
+       smp_mb();
+       overwrite = local_read(&(cpu_buffer->overrun));
+
        /*
         * Here's the tricky part.
         *
@@ -2929,6 +2960,11 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
        cpu_buffer->reader_page = reader;
        rb_reset_reader_page(cpu_buffer);
 
+       if (overwrite != cpu_buffer->last_overrun) {
+               cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun;
+               cpu_buffer->last_overrun = overwrite;
+       }
+
        goto again;
 
  out:
@@ -3005,8 +3041,14 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
                rb_advance_iter(iter);
 }
 
+static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
+{
+       return cpu_buffer->lost_events;
+}
+
 static struct ring_buffer_event *
-rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
+rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
+              unsigned long *lost_events)
 {
        struct ring_buffer_event *event;
        struct buffer_page *reader;
@@ -3058,6 +3100,8 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
                        ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
                                                         cpu_buffer->cpu, ts);
                }
+               if (lost_events)
+                       *lost_events = rb_lost_events(cpu_buffer);
                return event;
 
        default:
@@ -3168,12 +3212,14 @@ static inline int rb_ok_to_lock(void)
  * @buffer: The ring buffer to read
  * @cpu: The cpu to peek at
  * @ts: The timestamp counter of this event.
+ * @lost_events: a variable to store the number of lost events (may be NULL)
  *
  * This will return the event that will be read next, but does
  * not consume the data.
  */
 struct ring_buffer_event *
-ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
+                unsigned long *lost_events)
 {
        struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
        struct ring_buffer_event *event;
@@ -3188,7 +3234,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
        local_irq_save(flags);
        if (dolock)
                spin_lock(&cpu_buffer->reader_lock);
-       event = rb_buffer_peek(cpu_buffer, ts);
+       event = rb_buffer_peek(cpu_buffer, ts, lost_events);
        if (event && event->type_len == RINGBUF_TYPE_PADDING)
                rb_advance_reader(cpu_buffer);
        if (dolock)
@@ -3230,13 +3276,17 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 /**
  * ring_buffer_consume - return an event and consume it
  * @buffer: The ring buffer to get the next event from
+ * @cpu: the cpu to read the buffer from
+ * @ts: a variable to store the timestamp (may be NULL)
+ * @lost_events: a variable to store the number of lost events (may be NULL)
  *
  * Returns the next event in the ring buffer, and that event is consumed.
  * Meaning, that sequential reads will keep returning a different event,
  * and eventually empty the ring buffer if the producer is slower.
  */
 struct ring_buffer_event *
-ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
+ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
+                   unsigned long *lost_events)
 {
        struct ring_buffer_per_cpu *cpu_buffer;
        struct ring_buffer_event *event = NULL;
@@ -3257,9 +3307,11 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
        if (dolock)
                spin_lock(&cpu_buffer->reader_lock);
 
-       event = rb_buffer_peek(cpu_buffer, ts);
-       if (event)
+       event = rb_buffer_peek(cpu_buffer, ts, lost_events);
+       if (event) {
+               cpu_buffer->lost_events = 0;
                rb_advance_reader(cpu_buffer);
+       }
 
        if (dolock)
                spin_unlock(&cpu_buffer->reader_lock);
@@ -3276,23 +3328,30 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
 EXPORT_SYMBOL_GPL(ring_buffer_consume);
 
 /**
- * ring_buffer_read_start - start a non consuming read of the buffer
+ * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
  * @buffer: The ring buffer to read from
  * @cpu: The cpu buffer to iterate over
  *
- * This starts up an iteration through the buffer. It also disables
- * the recording to the buffer until the reading is finished.
- * This prevents the reading from being corrupted. This is not
- * a consuming read, so a producer is not expected.
+ * This performs the initial preparations necessary to iterate
+ * through the buffer.  Memory is allocated, buffer recording
+ * is disabled, and the iterator pointer is returned to the caller.
  *
- * Must be paired with ring_buffer_finish.
+ * Disabling buffer recording prevents the reading from being
+ * corrupted. This is not a consuming read, so a producer is not
+ * expected.
+ *
+ * After a sequence of ring_buffer_read_prepare calls, the user is
+ * expected to make at least one call to ring_buffer_read_prepare_sync.
+ * Afterwards, ring_buffer_read_start is invoked to get things going
+ * for real.
+ *
+ * Overall, this must be paired with ring_buffer_read_finish.
  */
 struct ring_buffer_iter *
-ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
+ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
 {
        struct ring_buffer_per_cpu *cpu_buffer;
        struct ring_buffer_iter *iter;
-       unsigned long flags;
 
        if (!cpumask_test_cpu(cpu, buffer->cpumask))
                return NULL;
@@ -3306,15 +3365,52 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
        iter->cpu_buffer = cpu_buffer;
 
        atomic_inc(&cpu_buffer->record_disabled);
+
+       return iter;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
+
+/**
+ * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
+ *
+ * All previously invoked ring_buffer_read_prepare calls to prepare
+ * iterators will be synchronized.  Afterwards, ring_buffer_read_start
+ * calls on those iterators are allowed.
+ */
+void
+ring_buffer_read_prepare_sync(void)
+{
        synchronize_sched();
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
+
+/**
+ * ring_buffer_read_start - start a non consuming read of the buffer
+ * @iter: The iterator returned by ring_buffer_read_prepare
+ *
+ * This finalizes the startup of an iteration through the buffer.
+ * The iterator comes from a call to ring_buffer_read_prepare and
+ * an intervening ring_buffer_read_prepare_sync must have been
+ * performed.
+ *
+ * Must be paired with ring_buffer_read_finish.
+ */
+void
+ring_buffer_read_start(struct ring_buffer_iter *iter)
+{
+       struct ring_buffer_per_cpu *cpu_buffer;
+       unsigned long flags;
+
+       if (!iter)
+               return;
+
+       cpu_buffer = iter->cpu_buffer;
 
        spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
        arch_spin_lock(&cpu_buffer->lock);
        rb_iter_reset(iter);
        arch_spin_unlock(&cpu_buffer->lock);
        spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
-
-       return iter;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_read_start);
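
Splitting the old ring_buffer_read_start() into prepare / prepare_sync / start lets a consumer pay the synchronize_sched() cost once for all CPUs instead of once per CPU. A minimal sketch of the intended calling sequence, mirroring the __tracing_open() change further down (the iters array and the buffer variable are illustrative):

    struct ring_buffer_iter *iters[NR_CPUS];        /* sketch only */
    int cpu;

    /* Step 1: allocate an iterator and disable recording on each CPU buffer. */
    for_each_tracing_cpu(cpu)
            iters[cpu] = ring_buffer_read_prepare(buffer, cpu);

    /* Step 2: a single synchronize_sched() covers every prepared iterator. */
    ring_buffer_read_prepare_sync();

    /* Step 3: reset each iterator and begin the non-consuming read. */
    for_each_tracing_cpu(cpu)
            ring_buffer_read_start(iters[cpu]);

    /* ... read via ring_buffer_iter_peek()/ring_buffer_read(), then pair
     * each iterator with ring_buffer_read_finish() when done ... */
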
 
@@ -3408,6 +3504,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
        cpu_buffer->write_stamp = 0;
        cpu_buffer->read_stamp = 0;
 
+       cpu_buffer->lost_events = 0;
+       cpu_buffer->last_overrun = 0;
+
        rb_head_page_activate(cpu_buffer);
 }
 
@@ -3683,6 +3782,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
        struct ring_buffer_event *event;
        struct buffer_data_page *bpage;
        struct buffer_page *reader;
+       unsigned long missed_events;
        unsigned long flags;
        unsigned int commit;
        unsigned int read;
@@ -3719,6 +3819,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
        read = reader->read;
        commit = rb_page_commit(reader);
 
+       /* Check if any events were dropped */
+       missed_events = cpu_buffer->lost_events;
+
        /*
         * If this page has been partially read or
         * if len is not big enough to read the rest of the page or
@@ -3779,9 +3882,35 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
                local_set(&reader->entries, 0);
                reader->read = 0;
                *data_page = bpage;
+
+               /*
+                * Use the real_end for the data size.
+                * This gives us a chance to store the lost events
+                * on the page.
+                */
+               if (reader->real_end)
+                       local_set(&bpage->commit, reader->real_end);
        }
        ret = read;
 
+       cpu_buffer->lost_events = 0;
+       /*
+        * Set a flag in the commit field if we lost events
+        */
+       if (missed_events) {
+               commit = local_read(&bpage->commit);
+
+               /* If there is room at the end of the page to save the
+                * missed events, then record them there.
+                */
+               if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) {
+                       memcpy(&bpage->data[commit], &missed_events,
+                              sizeof(missed_events));
+                       local_add(RB_MISSED_STORED, &bpage->commit);
+               }
+               local_add(RB_MISSED_EVENTS, &bpage->commit);
+       }
+
  out_unlock:
        spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
index df74c7982255ba1c5095371e1ca3fd5369d3f3c2..302f8a6146352a998f2730923598346196e84856 100644 (file)
@@ -81,7 +81,7 @@ static enum event_status read_event(int cpu)
        int *entry;
        u64 ts;
 
-       event = ring_buffer_consume(buffer, cpu, &ts);
+       event = ring_buffer_consume(buffer, cpu, &ts, NULL);
        if (!event)
                return EVENT_DROPPED;
 
@@ -113,7 +113,8 @@ static enum event_status read_page(int cpu)
        ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
        if (ret >= 0) {
                rpage = bpage;
-               commit = local_read(&rpage->commit);
+               /* The commit may have the missed-events flags set; clear them */
+               commit = local_read(&rpage->commit) & 0xfffff;
                for (i = 0; i < commit && !kill_test; i += inc) {
 
                        if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
index 44f916a04065d9025f82e5e6228f11b0629df3dd..ba0ec81158b268fc1d0d0fd1314a04d796d838e1 100644 (file)
@@ -117,9 +117,12 @@ static cpumask_var_t __read_mostly tracing_buffer_mask;
  *
  * It is default off, but you can enable it with either specifying
  * "ftrace_dump_on_oops" in the kernel command line, or setting
- * /proc/sys/kernel/ftrace_dump_on_oops to true.
+ * /proc/sys/kernel/ftrace_dump_on_oops.
+ * Set it to 1 to dump the buffers of all CPUs, or to 2 to dump only
+ * the buffer of the CPU that triggered the oops.
  */
-int ftrace_dump_on_oops;
+
+enum ftrace_dump_mode ftrace_dump_on_oops;
 
 static int tracing_set_tracer(const char *buf);
 
@@ -139,8 +142,17 @@ __setup("ftrace=", set_cmdline_ftrace);
 
 static int __init set_ftrace_dump_on_oops(char *str)
 {
-       ftrace_dump_on_oops = 1;
-       return 1;
+       if (*str++ != '=' || !*str) {
+               ftrace_dump_on_oops = DUMP_ALL;
+               return 1;
+       }
+
+       if (!strcmp("orig_cpu", str)) {
+               ftrace_dump_on_oops = DUMP_ORIG;
+               return 1;
+       }
+
+       return 0;
 }
 __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
 
@@ -1545,7 +1557,8 @@ static void trace_iterator_increment(struct trace_iterator *iter)
 }
 
 static struct trace_entry *
-peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
+peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
+               unsigned long *lost_events)
 {
        struct ring_buffer_event *event;
        struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu];
@@ -1556,7 +1569,8 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
        if (buf_iter)
                event = ring_buffer_iter_peek(buf_iter, ts);
        else
-               event = ring_buffer_peek(iter->tr->buffer, cpu, ts);
+               event = ring_buffer_peek(iter->tr->buffer, cpu, ts,
+                                        lost_events);
 
        ftrace_enable_cpu();
 
@@ -1564,10 +1578,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
 }
 
 static struct trace_entry *
-__find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
+__find_next_entry(struct trace_iterator *iter, int *ent_cpu,
+                 unsigned long *missing_events, u64 *ent_ts)
 {
        struct ring_buffer *buffer = iter->tr->buffer;
        struct trace_entry *ent, *next = NULL;
+       unsigned long lost_events = 0, next_lost = 0;
        int cpu_file = iter->cpu_file;
        u64 next_ts = 0, ts;
        int next_cpu = -1;
@@ -1580,7 +1596,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
        if (cpu_file > TRACE_PIPE_ALL_CPU) {
                if (ring_buffer_empty_cpu(buffer, cpu_file))
                        return NULL;
-               ent = peek_next_entry(iter, cpu_file, ent_ts);
+               ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events);
                if (ent_cpu)
                        *ent_cpu = cpu_file;
 
@@ -1592,7 +1608,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
                if (ring_buffer_empty_cpu(buffer, cpu))
                        continue;
 
-               ent = peek_next_entry(iter, cpu, &ts);
+               ent = peek_next_entry(iter, cpu, &ts, &lost_events);
 
                /*
                 * Pick the entry with the smallest timestamp:
@@ -1601,6 +1617,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
                        next = ent;
                        next_cpu = cpu;
                        next_ts = ts;
+                       next_lost = lost_events;
                }
        }
 
@@ -1610,6 +1627,9 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
        if (ent_ts)
                *ent_ts = next_ts;
 
+       if (missing_events)
+               *missing_events = next_lost;
+
        return next;
 }
 
@@ -1617,13 +1637,14 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
 struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
                                          int *ent_cpu, u64 *ent_ts)
 {
-       return __find_next_entry(iter, ent_cpu, ent_ts);
+       return __find_next_entry(iter, ent_cpu, NULL, ent_ts);
 }
 
 /* Find the next real entry, and increment the iterator to the next entry */
 static void *find_next_entry_inc(struct trace_iterator *iter)
 {
-       iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts);
+       iter->ent = __find_next_entry(iter, &iter->cpu,
+                                     &iter->lost_events, &iter->ts);
 
        if (iter->ent)
                trace_iterator_increment(iter);
@@ -1635,7 +1656,8 @@ static void trace_consume(struct trace_iterator *iter)
 {
        /* Don't allow ftrace to trace into the ring buffers */
        ftrace_disable_cpu();
-       ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts);
+       ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts,
+                           &iter->lost_events);
        ftrace_enable_cpu();
 }
 
@@ -1786,7 +1808,7 @@ static void print_func_help_header(struct seq_file *m)
 }
 
 
-static void
+void
 print_trace_header(struct seq_file *m, struct trace_iterator *iter)
 {
        unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
@@ -1914,7 +1936,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
        }
 
        if (event)
-               return event->trace(iter, sym_flags);
+               return event->funcs->trace(iter, sym_flags, event);
 
        if (!trace_seq_printf(s, "Unknown type %d\n", entry->type))
                goto partial;
@@ -1940,7 +1962,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
 
        event = ftrace_find_event(entry->type);
        if (event)
-               return event->raw(iter, 0);
+               return event->funcs->raw(iter, 0, event);
 
        if (!trace_seq_printf(s, "%d ?\n", entry->type))
                goto partial;
@@ -1967,7 +1989,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
 
        event = ftrace_find_event(entry->type);
        if (event) {
-               enum print_line_t ret = event->hex(iter, 0);
+               enum print_line_t ret = event->funcs->hex(iter, 0, event);
                if (ret != TRACE_TYPE_HANDLED)
                        return ret;
        }
@@ -1992,10 +2014,11 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
        }
 
        event = ftrace_find_event(entry->type);
-       return event ? event->binary(iter, 0) : TRACE_TYPE_HANDLED;
+       return event ? event->funcs->binary(iter, 0, event) :
+               TRACE_TYPE_HANDLED;
 }
 
-static int trace_empty(struct trace_iterator *iter)
+int trace_empty(struct trace_iterator *iter)
 {
        int cpu;
 
@@ -2030,6 +2053,10 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
 {
        enum print_line_t ret;
 
+       if (iter->lost_events)
+               trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
+                                iter->cpu, iter->lost_events);
+
        if (iter->trace && iter->trace->print_line) {
                ret = iter->trace->print_line(iter);
                if (ret != TRACE_TYPE_UNHANDLED)
@@ -2058,6 +2085,23 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
        return print_trace_fmt(iter);
 }
 
+void trace_default_header(struct seq_file *m)
+{
+       struct trace_iterator *iter = m->private;
+
+       if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
+               /* print nothing if the buffers are empty */
+               if (trace_empty(iter))
+                       return;
+               print_trace_header(m, iter);
+               if (!(trace_flags & TRACE_ITER_VERBOSE))
+                       print_lat_help_header(m);
+       } else {
+               if (!(trace_flags & TRACE_ITER_VERBOSE))
+                       print_func_help_header(m);
+       }
+}
+
 static int s_show(struct seq_file *m, void *v)
 {
        struct trace_iterator *iter = v;
@@ -2070,17 +2114,9 @@ static int s_show(struct seq_file *m, void *v)
                }
                if (iter->trace && iter->trace->print_header)
                        iter->trace->print_header(m);
-               else if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
-                       /* print nothing if the buffers are empty */
-                       if (trace_empty(iter))
-                               return 0;
-                       print_trace_header(m, iter);
-                       if (!(trace_flags & TRACE_ITER_VERBOSE))
-                               print_lat_help_header(m);
-               } else {
-                       if (!(trace_flags & TRACE_ITER_VERBOSE))
-                               print_func_help_header(m);
-               }
+               else
+                       trace_default_header(m);
+
        } else if (iter->leftover) {
                /*
                 * If we filled the seq_file buffer earlier, we
@@ -2166,15 +2202,20 @@ __tracing_open(struct inode *inode, struct file *file)
 
        if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
                for_each_tracing_cpu(cpu) {
-
                        iter->buffer_iter[cpu] =
-                               ring_buffer_read_start(iter->tr->buffer, cpu);
+                               ring_buffer_read_prepare(iter->tr->buffer, cpu);
+               }
+               ring_buffer_read_prepare_sync();
+               for_each_tracing_cpu(cpu) {
+                       ring_buffer_read_start(iter->buffer_iter[cpu]);
                        tracing_iter_reset(iter, cpu);
                }
        } else {
                cpu = iter->cpu_file;
                iter->buffer_iter[cpu] =
-                               ring_buffer_read_start(iter->tr->buffer, cpu);
+                       ring_buffer_read_prepare(iter->tr->buffer, cpu);
+               ring_buffer_read_prepare_sync();
+               ring_buffer_read_start(iter->buffer_iter[cpu]);
                tracing_iter_reset(iter, cpu);
        }
 
@@ -4324,7 +4365,7 @@ static int trace_panic_handler(struct notifier_block *this,
                               unsigned long event, void *unused)
 {
        if (ftrace_dump_on_oops)
-               ftrace_dump();
+               ftrace_dump(ftrace_dump_on_oops);
        return NOTIFY_OK;
 }
 
@@ -4341,7 +4382,7 @@ static int trace_die_handler(struct notifier_block *self,
        switch (val) {
        case DIE_OOPS:
                if (ftrace_dump_on_oops)
-                       ftrace_dump();
+                       ftrace_dump(ftrace_dump_on_oops);
                break;
        default:
                break;
@@ -4382,7 +4423,8 @@ trace_printk_seq(struct trace_seq *s)
        trace_seq_init(s);
 }
 
-static void __ftrace_dump(bool disable_tracing)
+static void
+__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
 {
        static arch_spinlock_t ftrace_dump_lock =
                (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
@@ -4415,12 +4457,25 @@ static void __ftrace_dump(bool disable_tracing)
        /* don't look at user memory in panic mode */
        trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
 
-       printk(KERN_TRACE "Dumping ftrace buffer:\n");
-
        /* Simulate the iterator */
        iter.tr = &global_trace;
        iter.trace = current_trace;
-       iter.cpu_file = TRACE_PIPE_ALL_CPU;
+
+       switch (oops_dump_mode) {
+       case DUMP_ALL:
+               iter.cpu_file = TRACE_PIPE_ALL_CPU;
+               break;
+       case DUMP_ORIG:
+               iter.cpu_file = raw_smp_processor_id();
+               break;
+       case DUMP_NONE:
+               goto out_enable;
+       default:
+               printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n");
+               iter.cpu_file = TRACE_PIPE_ALL_CPU;
+       }
+
+       printk(KERN_TRACE "Dumping ftrace buffer:\n");
 
        /*
         * We need to stop all tracing on all CPUS to read the
@@ -4459,6 +4514,7 @@ static void __ftrace_dump(bool disable_tracing)
        else
                printk(KERN_TRACE "---------------------------------\n");
 
+ out_enable:
        /* Re-enable tracing if requested */
        if (!disable_tracing) {
                trace_flags |= old_userobj;
@@ -4475,9 +4531,9 @@ static void __ftrace_dump(bool disable_tracing)
 }
 
 /* By default: disable tracing after the dump */
-void ftrace_dump(void)
+void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
 {
-       __ftrace_dump(true);
+       __ftrace_dump(true, oops_dump_mode);
 }
 
 __init static int tracer_alloc_buffers(void)
index 3ebdb6bd2362a68e9121c1d0aeb5acb31229fa60..2cd96399463f88d51a5683802cc34f74343f881f 100644 (file)
@@ -364,6 +364,9 @@ void trace_function(struct trace_array *tr,
                    unsigned long ip,
                    unsigned long parent_ip,
                    unsigned long flags, int pc);
+void trace_default_header(struct seq_file *m);
+void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
+int trace_empty(struct trace_iterator *iter);
 
 void trace_graph_return(struct ftrace_graph_ret *trace);
 int trace_graph_entry(struct ftrace_graph_ent *trace);
@@ -402,12 +405,12 @@ void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
 void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
                   int pc);
 #else
-static inline void ftrace_trace_stack(struct trace_array *tr,
+static inline void ftrace_trace_stack(struct ring_buffer *buffer,
                                      unsigned long flags, int skip, int pc)
 {
 }
 
-static inline void ftrace_trace_userstack(struct trace_array *tr,
+static inline void ftrace_trace_userstack(struct ring_buffer *buffer,
                                          unsigned long flags, int pc)
 {
 }
@@ -475,9 +478,29 @@ extern int trace_clock_id;
 
 /* Standard output formatting function used for function return traces */
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-extern enum print_line_t print_graph_function(struct trace_iterator *iter);
+
+/* Flag options */
+#define TRACE_GRAPH_PRINT_OVERRUN       0x1
+#define TRACE_GRAPH_PRINT_CPU           0x2
+#define TRACE_GRAPH_PRINT_OVERHEAD      0x4
+#define TRACE_GRAPH_PRINT_PROC          0x8
+#define TRACE_GRAPH_PRINT_DURATION      0x10
+#define TRACE_GRAPH_PRINT_ABS_TIME      0x20
+
+extern enum print_line_t
+print_graph_function_flags(struct trace_iterator *iter, u32 flags);
+extern void print_graph_headers_flags(struct seq_file *s, u32 flags);
 extern enum print_line_t
 trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
+extern void graph_trace_open(struct trace_iterator *iter);
+extern void graph_trace_close(struct trace_iterator *iter);
+extern int __trace_graph_entry(struct trace_array *tr,
+                              struct ftrace_graph_ent *trace,
+                              unsigned long flags, int pc);
+extern void __trace_graph_return(struct trace_array *tr,
+                                struct ftrace_graph_ret *trace,
+                                unsigned long flags, int pc);
+
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 /* TODO: make this variable */
@@ -508,7 +531,7 @@ static inline int ftrace_graph_addr(unsigned long addr)
 #endif /* CONFIG_DYNAMIC_FTRACE */
 #else /* CONFIG_FUNCTION_GRAPH_TRACER */
 static inline enum print_line_t
-print_graph_function(struct trace_iterator *iter)
+print_graph_function_flags(struct trace_iterator *iter, u32 flags)
 {
        return TRACE_TYPE_UNHANDLED;
 }
@@ -755,12 +778,15 @@ extern void print_subsystem_event_filter(struct event_subsystem *system,
                                         struct trace_seq *s);
 extern int filter_assign_type(const char *type);
 
+struct list_head *
+trace_get_fields(struct ftrace_event_call *event_call);
+
 static inline int
 filter_check_discard(struct ftrace_event_call *call, void *rec,
                     struct ring_buffer *buffer,
                     struct ring_buffer_event *event)
 {
-       if (unlikely(call->filter_active) &&
+       if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
            !filter_match_preds(call->filter, rec)) {
                ring_buffer_discard_commit(buffer, event);
                return 1;
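
trace.h now exports the graph output with an explicit flag mask, so other tracers can reuse it without touching the graph tracer's own option state. A sketch of such a caller, assuming it runs from a tracer's print_line hook:

	/* Sketch only: format the current entry with the graph output,
	 * selecting columns through the flags declared above. */
	u32 flags = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_PROC |
		    TRACE_GRAPH_PRINT_DURATION;

	return print_graph_function_flags(iter, flags);
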
index b9bc4d47017724a5dc278b910f1b2c4fff443741..8d3538b4ea5f0c93be082c17d92f0ee1787b2fda 100644 (file)
@@ -143,7 +143,7 @@ static void branch_trace_reset(struct trace_array *tr)
 }
 
 static enum print_line_t trace_branch_print(struct trace_iterator *iter,
-                                           int flags)
+                                           int flags, struct trace_event *event)
 {
        struct trace_branch *field;
 
@@ -167,9 +167,13 @@ static void branch_print_header(struct seq_file *s)
                "    |\n");
 }
 
+static struct trace_event_functions trace_branch_funcs = {
+       .trace          = trace_branch_print,
+};
+
 static struct trace_event trace_branch_event = {
        .type           = TRACE_BRANCH,
-       .trace          = trace_branch_print,
+       .funcs          = &trace_branch_funcs,
 };
 
 static struct tracer branch_trace __read_mostly =
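
The branch tracer shows the new registration pattern: the output callbacks move into a shared struct trace_event_functions, and every handler gains a struct trace_event * argument. A condensed sketch of wiring up a custom output handler the same way (the my_* names are placeholders):

	static enum print_line_t my_output(struct trace_iterator *iter,
					   int flags, struct trace_event *event)
	{
		return TRACE_TYPE_HANDLED;
	}

	static struct trace_event_functions my_output_funcs = {
		.trace	= my_output,
	};

	static struct trace_event my_output_event = {
		.type	= 0,	/* 0 asks register_ftrace_event() for an id */
		.funcs	= &my_output_funcs,
	};

	/* ... register_ftrace_event(&my_output_event); */
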
index 0565bb42566f6982d6d197857e0b7e07511ab8a7..0a47e8d6b4914e555ece3abec80a45db64f3c3d3 100644 (file)
@@ -49,7 +49,12 @@ static int perf_trace_event_enable(struct ftrace_event_call *event)
                rcu_assign_pointer(perf_trace_buf_nmi, buf);
        }
 
-       ret = event->perf_event_enable(event);
+       if (event->class->reg)
+               ret = event->class->reg(event, TRACE_REG_PERF_REGISTER);
+       else
+               ret = tracepoint_probe_register(event->name,
+                                               event->class->perf_probe,
+                                               event);
        if (!ret) {
                total_ref_count++;
                return 0;
@@ -75,7 +80,8 @@ int perf_trace_enable(int event_id)
 
        mutex_lock(&event_mutex);
        list_for_each_entry(event, &ftrace_events, list) {
-               if (event->id == event_id && event->perf_event_enable &&
+               if (event->event.type == event_id &&
+                   event->class && event->class->perf_probe &&
                    try_module_get(event->mod)) {
                        ret = perf_trace_event_enable(event);
                        break;
@@ -93,7 +99,10 @@ static void perf_trace_event_disable(struct ftrace_event_call *event)
        if (--event->perf_refcount > 0)
                return;
 
-       event->perf_event_disable(event);
+       if (event->class->reg)
+               event->class->reg(event, TRACE_REG_PERF_UNREGISTER);
+       else
+               tracepoint_probe_unregister(event->name, event->class->perf_probe, event);
 
        if (!--total_ref_count) {
                buf = perf_trace_buf;
@@ -119,7 +128,7 @@ void perf_trace_disable(int event_id)
 
        mutex_lock(&event_mutex);
        list_for_each_entry(event, &ftrace_events, list) {
-               if (event->id == event_id) {
+               if (event->event.type == event_id) {
                        perf_trace_event_disable(event);
                        module_put(event->mod);
                        break;
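
perf enable/disable now prefers a class-provided ->reg() callback and only falls back to registering the tracepoint probe directly. A sketch of what such a callback could look like; the enum trace_reg type name is assumed from ftrace_event.h and is not shown in this hunk:

	static int my_class_reg(struct ftrace_event_call *call, enum trace_reg type)
	{
		switch (type) {
		case TRACE_REG_REGISTER:
			return tracepoint_probe_register(call->name,
							 call->class->probe, call);
		case TRACE_REG_UNREGISTER:
			tracepoint_probe_unregister(call->name,
						    call->class->probe, call);
			return 0;
		case TRACE_REG_PERF_REGISTER:
			return tracepoint_probe_register(call->name,
							 call->class->perf_probe, call);
		case TRACE_REG_PERF_UNREGISTER:
			tracepoint_probe_unregister(call->name,
						    call->class->perf_probe, call);
			return 0;
		}
		return 0;
	}
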
index c697c70433494d6e41e51656a767c9f575cdccba..53cffc0b08014db76d9e87da0d8bbc3c9768bd49 100644 (file)
@@ -29,11 +29,23 @@ DEFINE_MUTEX(event_mutex);
 
 LIST_HEAD(ftrace_events);
 
+struct list_head *
+trace_get_fields(struct ftrace_event_call *event_call)
+{
+       if (!event_call->class->get_fields)
+               return &event_call->class->fields;
+       return event_call->class->get_fields(event_call);
+}
+
 int trace_define_field(struct ftrace_event_call *call, const char *type,
                       const char *name, int offset, int size, int is_signed,
                       int filter_type)
 {
        struct ftrace_event_field *field;
+       struct list_head *head;
+
+       if (WARN_ON(!call->class))
+               return 0;
 
        field = kzalloc(sizeof(*field), GFP_KERNEL);
        if (!field)
@@ -56,7 +68,8 @@ int trace_define_field(struct ftrace_event_call *call, const char *type,
        field->size = size;
        field->is_signed = is_signed;
 
-       list_add(&field->link, &call->fields);
+       head = trace_get_fields(call);
+       list_add(&field->link, head);
 
        return 0;
 
@@ -94,8 +107,10 @@ static int trace_define_common_fields(struct ftrace_event_call *call)
 void trace_destroy_fields(struct ftrace_event_call *call)
 {
        struct ftrace_event_field *field, *next;
+       struct list_head *head;
 
-       list_for_each_entry_safe(field, next, &call->fields, link) {
+       head = trace_get_fields(call);
+       list_for_each_entry_safe(field, next, head, link) {
                list_del(&field->link);
                kfree(field->type);
                kfree(field->name);
@@ -107,11 +122,9 @@ int trace_event_raw_init(struct ftrace_event_call *call)
 {
        int id;
 
-       id = register_ftrace_event(call->event);
+       id = register_ftrace_event(&call->event);
        if (!id)
                return -ENODEV;
-       call->id = id;
-       INIT_LIST_HEAD(&call->fields);
 
        return 0;
 }
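
With the event id folded into the embedded struct trace_event, code that used to read call->id reads call->event.type once trace_event_raw_init() has succeeded. A trivial, purely illustrative sketch:

	if (!trace_event_raw_init(call))
		pr_info("event %s registered as type %d\n",
			call->name, call->event.type);
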
@@ -124,23 +137,33 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
 
        switch (enable) {
        case 0:
-               if (call->enabled) {
-                       call->enabled = 0;
+               if (call->flags & TRACE_EVENT_FL_ENABLED) {
+                       call->flags &= ~TRACE_EVENT_FL_ENABLED;
                        tracing_stop_cmdline_record();
-                       call->unregfunc(call);
+                       if (call->class->reg)
+                               call->class->reg(call, TRACE_REG_UNREGISTER);
+                       else
+                               tracepoint_probe_unregister(call->name,
+                                                           call->class->probe,
+                                                           call);
                }
                break;
        case 1:
-               if (!call->enabled) {
+               if (!(call->flags & TRACE_EVENT_FL_ENABLED)) {
                        tracing_start_cmdline_record();
-                       ret = call->regfunc(call);
+                       if (call->class->reg)
+                               ret = call->class->reg(call, TRACE_REG_REGISTER);
+                       else
+                               ret = tracepoint_probe_register(call->name,
+                                                               call->class->probe,
+                                                               call);
                        if (ret) {
                                tracing_stop_cmdline_record();
                                pr_info("event trace: Could not enable event "
                                        "%s\n", call->name);
                                break;
                        }
-                       call->enabled = 1;
+                       call->flags |= TRACE_EVENT_FL_ENABLED;
                }
                break;
        }
@@ -171,15 +194,16 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
        mutex_lock(&event_mutex);
        list_for_each_entry(call, &ftrace_events, list) {
 
-               if (!call->name || !call->regfunc)
+               if (!call->name || !call->class ||
+                   (!call->class->probe && !call->class->reg))
                        continue;
 
                if (match &&
                    strcmp(match, call->name) != 0 &&
-                   strcmp(match, call->system) != 0)
+                   strcmp(match, call->class->system) != 0)
                        continue;
 
-               if (sub && strcmp(sub, call->system) != 0)
+               if (sub && strcmp(sub, call->class->system) != 0)
                        continue;
 
                if (event && strcmp(event, call->name) != 0)
@@ -297,7 +321,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
                 * The ftrace subsystem is for showing formats only.
                 * They can not be enabled or disabled via the event files.
                 */
-               if (call->regfunc)
+               if (call->class && (call->class->probe || call->class->reg))
                        return call;
        }
 
@@ -328,7 +352,7 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
        (*pos)++;
 
        list_for_each_entry_continue(call, &ftrace_events, list) {
-               if (call->enabled)
+               if (call->flags & TRACE_EVENT_FL_ENABLED)
                        return call;
        }
 
@@ -355,8 +379,8 @@ static int t_show(struct seq_file *m, void *v)
 {
        struct ftrace_event_call *call = v;
 
-       if (strcmp(call->system, TRACE_SYSTEM) != 0)
-               seq_printf(m, "%s:", call->system);
+       if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
+               seq_printf(m, "%s:", call->class->system);
        seq_printf(m, "%s\n", call->name);
 
        return 0;
@@ -387,7 +411,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
        struct ftrace_event_call *call = filp->private_data;
        char *buf;
 
-       if (call->enabled)
+       if (call->flags & TRACE_EVENT_FL_ENABLED)
                buf = "1\n";
        else
                buf = "0\n";
@@ -450,10 +474,11 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 
        mutex_lock(&event_mutex);
        list_for_each_entry(call, &ftrace_events, list) {
-               if (!call->name || !call->regfunc)
+               if (!call->name || !call->class ||
+                   (!call->class->probe && !call->class->reg))
                        continue;
 
-               if (system && strcmp(call->system, system) != 0)
+               if (system && strcmp(call->class->system, system) != 0)
                        continue;
 
                /*
@@ -461,7 +486,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
                 * or if all events are cleared, or if we have
                 * a mixture.
                 */
-               set |= (1 << !!call->enabled);
+               set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED));
 
                /*
                 * If we have a mixture, no need to look further.
@@ -525,6 +550,7 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
 {
        struct ftrace_event_call *call = filp->private_data;
        struct ftrace_event_field *field;
+       struct list_head *head;
        struct trace_seq *s;
        int common_field_count = 5;
        char *buf;
@@ -540,10 +566,11 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
        trace_seq_init(s);
 
        trace_seq_printf(s, "name: %s\n", call->name);
-       trace_seq_printf(s, "ID: %d\n", call->id);
+       trace_seq_printf(s, "ID: %d\n", call->event.type);
        trace_seq_printf(s, "format:\n");
 
-       list_for_each_entry_reverse(field, &call->fields, link) {
+       head = trace_get_fields(call);
+       list_for_each_entry_reverse(field, head, link) {
                /*
                 * Smartly shows the array type(except dynamic array).
                 * Normal:
@@ -613,7 +640,7 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
                return -ENOMEM;
 
        trace_seq_init(s);
-       trace_seq_printf(s, "%d\n", call->id);
+       trace_seq_printf(s, "%d\n", call->event.type);
 
        r = simple_read_from_buffer(ubuf, cnt, ppos,
                                    s->buffer, s->len);
@@ -919,14 +946,15 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
                 const struct file_operations *filter,
                 const struct file_operations *format)
 {
+       struct list_head *head;
        int ret;
 
        /*
         * If the trace point header did not define TRACE_SYSTEM
         * then the system would be called "TRACE_SYSTEM".
         */
-       if (strcmp(call->system, TRACE_SYSTEM) != 0)
-               d_events = event_subsystem_dir(call->system, d_events);
+       if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
+               d_events = event_subsystem_dir(call->class->system, d_events);
 
        call->dir = debugfs_create_dir(call->name, d_events);
        if (!call->dir) {
@@ -935,22 +963,31 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
                return -1;
        }
 
-       if (call->regfunc)
+       if (call->class->probe || call->class->reg)
                trace_create_file("enable", 0644, call->dir, call,
                                  enable);
 
-       if (call->id && call->perf_event_enable)
+#ifdef CONFIG_PERF_EVENTS
+       if (call->event.type && (call->class->perf_probe || call->class->reg))
                trace_create_file("id", 0444, call->dir, call,
                                  id);
+#endif
 
-       if (call->define_fields) {
-               ret = trace_define_common_fields(call);
-               if (!ret)
-                       ret = call->define_fields(call);
-               if (ret < 0) {
-                       pr_warning("Could not initialize trace point"
-                                  " events/%s\n", call->name);
-                       return ret;
+       if (call->class->define_fields) {
+               /*
+                * Other events may have the same class. Only update
+                * the fields if they are not already defined.
+                */
+               head = trace_get_fields(call);
+               if (list_empty(head)) {
+                       ret = trace_define_common_fields(call);
+                       if (!ret)
+                               ret = call->class->define_fields(call);
+                       if (ret < 0) {
+                               pr_warning("Could not initialize trace point"
+                                          " events/%s\n", call->name);
+                               return ret;
+                       }
                }
                trace_create_file("filter", 0644, call->dir, call,
                                  filter);
@@ -970,8 +1007,8 @@ static int __trace_add_event_call(struct ftrace_event_call *call)
        if (!call->name)
                return -EINVAL;
 
-       if (call->raw_init) {
-               ret = call->raw_init(call);
+       if (call->class->raw_init) {
+               ret = call->class->raw_init(call);
                if (ret < 0) {
                        if (ret != -ENOSYS)
                                pr_warning("Could not initialize trace "
@@ -1035,13 +1072,13 @@ static void remove_subsystem_dir(const char *name)
 static void __trace_remove_event_call(struct ftrace_event_call *call)
 {
        ftrace_event_enable_disable(call, 0);
-       if (call->event)
-               __unregister_ftrace_event(call->event);
+       if (call->event.funcs)
+               __unregister_ftrace_event(&call->event);
        debugfs_remove_recursive(call->dir);
        list_del(&call->list);
        trace_destroy_fields(call);
        destroy_preds(call);
-       remove_subsystem_dir(call->system);
+       remove_subsystem_dir(call->class->system);
 }
 
 /* Remove an event_call */
@@ -1132,8 +1169,8 @@ static void trace_module_add_events(struct module *mod)
                /* The linker may leave blanks */
                if (!call->name)
                        continue;
-               if (call->raw_init) {
-                       ret = call->raw_init(call);
+               if (call->class->raw_init) {
+                       ret = call->class->raw_init(call);
                        if (ret < 0) {
                                if (ret != -ENOSYS)
                                        pr_warning("Could not initialize trace "
@@ -1286,8 +1323,8 @@ static __init int event_trace_init(void)
                /* The linker may leave blanks */
                if (!call->name)
                        continue;
-               if (call->raw_init) {
-                       ret = call->raw_init(call);
+               if (call->class->raw_init) {
+                       ret = call->class->raw_init(call);
                        if (ret < 0) {
                                if (ret != -ENOSYS)
                                        pr_warning("Could not initialize trace "
@@ -1388,8 +1425,8 @@ static __init void event_trace_self_tests(void)
 
        list_for_each_entry(call, &ftrace_events, list) {
 
-               /* Only test those that have a regfunc */
-               if (!call->regfunc)
+               /* Only test those that have a probe */
+               if (!call->class || !call->class->probe)
                        continue;
 
 /*
@@ -1399,8 +1436,8 @@ static __init void event_trace_self_tests(void)
  * syscalls as we test.
  */
 #ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS
-               if (call->system &&
-                   strcmp(call->system, "syscalls") == 0)
+               if (call->class->system &&
+                   strcmp(call->class->system, "syscalls") == 0)
                        continue;
 #endif
 
@@ -1410,7 +1447,7 @@ static __init void event_trace_self_tests(void)
                 * If an event is already enabled, someone is using
                 * it and the self test should not be on.
                 */
-               if (call->enabled) {
+               if (call->flags & TRACE_EVENT_FL_ENABLED) {
                        pr_warning("Enabled event during self test!\n");
                        WARN_ON_ONCE(1);
                        continue;
index 58092d844a1fce94b36ecb3763ccfc0fded5d1a4..57bb1bb329997f62bc5805e6a3af8d5ecf64479a 100644 (file)
@@ -500,8 +500,10 @@ static struct ftrace_event_field *
 find_event_field(struct ftrace_event_call *call, char *name)
 {
        struct ftrace_event_field *field;
+       struct list_head *head;
 
-       list_for_each_entry(field, &call->fields, link) {
+       head = trace_get_fields(call);
+       list_for_each_entry(field, head, link) {
                if (!strcmp(field->name, name))
                        return field;
        }
@@ -545,7 +547,7 @@ static void filter_disable_preds(struct ftrace_event_call *call)
        struct event_filter *filter = call->filter;
        int i;
 
-       call->filter_active = 0;
+       call->flags &= ~TRACE_EVENT_FL_FILTERED;
        filter->n_preds = 0;
 
        for (i = 0; i < MAX_FILTER_PRED; i++)
@@ -572,7 +574,7 @@ void destroy_preds(struct ftrace_event_call *call)
 {
        __free_preds(call->filter);
        call->filter = NULL;
-       call->filter_active = 0;
+       call->flags &= ~TRACE_EVENT_FL_FILTERED;
 }
 
 static struct event_filter *__alloc_preds(void)
@@ -611,7 +613,7 @@ static int init_preds(struct ftrace_event_call *call)
        if (call->filter)
                return 0;
 
-       call->filter_active = 0;
+       call->flags &= ~TRACE_EVENT_FL_FILTERED;
        call->filter = __alloc_preds();
        if (IS_ERR(call->filter))
                return PTR_ERR(call->filter);
@@ -625,10 +627,10 @@ static int init_subsystem_preds(struct event_subsystem *system)
        int err;
 
        list_for_each_entry(call, &ftrace_events, list) {
-               if (!call->define_fields)
+               if (!call->class || !call->class->define_fields)
                        continue;
 
-               if (strcmp(call->system, system->name) != 0)
+               if (strcmp(call->class->system, system->name) != 0)
                        continue;
 
                err = init_preds(call);
@@ -644,10 +646,10 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
        struct ftrace_event_call *call;
 
        list_for_each_entry(call, &ftrace_events, list) {
-               if (!call->define_fields)
+               if (!call->class || !call->class->define_fields)
                        continue;
 
-               if (strcmp(call->system, system->name) != 0)
+               if (strcmp(call->class->system, system->name) != 0)
                        continue;
 
                filter_disable_preds(call);
@@ -1249,10 +1251,10 @@ static int replace_system_preds(struct event_subsystem *system,
        list_for_each_entry(call, &ftrace_events, list) {
                struct event_filter *filter = call->filter;
 
-               if (!call->define_fields)
+               if (!call->class || !call->class->define_fields)
                        continue;
 
-               if (strcmp(call->system, system->name) != 0)
+               if (strcmp(call->class->system, system->name) != 0)
                        continue;
 
                /* try to see if the filter can be applied */
@@ -1266,7 +1268,7 @@ static int replace_system_preds(struct event_subsystem *system,
                if (err)
                        filter_disable_preds(call);
                else {
-                       call->filter_active = 1;
+                       call->flags |= TRACE_EVENT_FL_FILTERED;
                        replace_filter_string(filter, filter_string);
                }
                fail = false;
@@ -1315,7 +1317,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
        if (err)
                append_filter_err(ps, call->filter);
        else
-               call->filter_active = 1;
+               call->flags |= TRACE_EVENT_FL_FILTERED;
 out:
        filter_opstack_clear(ps);
        postfix_clear(ps);
@@ -1393,7 +1395,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
        mutex_lock(&event_mutex);
 
        list_for_each_entry(call, &ftrace_events, list) {
-               if (call->id == event_id)
+               if (call->event.type == event_id)
                        break;
        }
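
Throughout the filter code the dedicated filter_active int is gone; filtered and enabled state is kept as bits in call->flags. A compressed sketch of the new discipline (flag names as introduced by this series):

	call->flags |= TRACE_EVENT_FL_FILTERED;   /* was: call->filter_active = 1; */
	call->flags &= ~TRACE_EVENT_FL_FILTERED;  /* was: call->filter_active = 0; */

	if (call->flags & TRACE_EVENT_FL_ENABLED) /* was: if (call->enabled) */
		pr_debug("%s is being traced\n", call->name);
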
 
index e091f64ba6ce04dc6437276bbb2cadd4fe39de4f..8536e2a659690f5aa82935ec20eb7d8fa74e3b5f 100644 (file)
@@ -127,7 +127,7 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call)   \
 
 static int ftrace_raw_init_event(struct ftrace_event_call *call)
 {
-       INIT_LIST_HEAD(&call->fields);
+       INIT_LIST_HEAD(&call->class->fields);
        return 0;
 }
 
@@ -153,17 +153,21 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
 #define F_printk(fmt, args...) #fmt ", "  __stringify(args)
 
 #undef FTRACE_ENTRY
-#define FTRACE_ENTRY(call, struct_name, type, tstruct, print)          \
+#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print)         \
+                                                                       \
+struct ftrace_event_class event_class_ftrace_##call = {                        \
+       .system                 = __stringify(TRACE_SYSTEM),            \
+       .define_fields          = ftrace_define_fields_##call,          \
+       .raw_init               = ftrace_raw_init_event,                \
+};                                                                     \
                                                                        \
 struct ftrace_event_call __used                                                \
 __attribute__((__aligned__(4)))                                                \
 __attribute__((section("_ftrace_events"))) event_##call = {            \
        .name                   = #call,                                \
-       .id                     = type,                                 \
-       .system                 = __stringify(TRACE_SYSTEM),            \
-       .raw_init               = ftrace_raw_init_event,                \
+       .event.type             = etype,                                \
+       .class                  = &event_class_ftrace_##call,           \
        .print_fmt              = print,                                \
-       .define_fields          = ftrace_define_fields_##call,          \
 };                                                                     \
 
 #include "trace_entries.h"
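
For the built-in ftrace entries, FTRACE_ENTRY() now emits a shared class object next to the call. Roughly, the "function" entry from trace_entries.h would expand to something like the following; attributes and the real print format are omitted, so treat the expansion as approximate:

	struct ftrace_event_class event_class_ftrace_function = {
		.system		= __stringify(TRACE_SYSTEM),
		.define_fields	= ftrace_define_fields_function,
		.raw_init	= ftrace_raw_init_event,
	};

	struct ftrace_event_call event_function = {
		.name		= "function",
		.event.type	= TRACE_FN,
		.class		= &event_class_ftrace_function,
		.print_fmt	= "...",	/* generated by F_printk() */
	};
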
index 9aed1a5cf553a720e1568de57cec3610ae88c415..79f4bac99a94a767569a8247091eb7fd196fae7b 100644 (file)
@@ -40,7 +40,7 @@ struct fgraph_data {
 #define TRACE_GRAPH_PRINT_OVERHEAD     0x4
 #define TRACE_GRAPH_PRINT_PROC         0x8
 #define TRACE_GRAPH_PRINT_DURATION     0x10
-#define TRACE_GRAPH_PRINT_ABS_TIME     0X20
+#define TRACE_GRAPH_PRINT_ABS_TIME     0x20
 
 static struct tracer_opt trace_opts[] = {
        /* Display overruns? (for self-debug purpose) */
@@ -179,7 +179,7 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
        return ret;
 }
 
-static int __trace_graph_entry(struct trace_array *tr,
+int __trace_graph_entry(struct trace_array *tr,
                                struct ftrace_graph_ent *trace,
                                unsigned long flags,
                                int pc)
@@ -246,7 +246,7 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
                return trace_graph_entry(trace);
 }
 
-static void __trace_graph_return(struct trace_array *tr,
+void __trace_graph_return(struct trace_array *tr,
                                struct ftrace_graph_ret *trace,
                                unsigned long flags,
                                int pc)
@@ -490,9 +490,10 @@ get_return_for_leaf(struct trace_iterator *iter,
                         * We need to consume the current entry to see
                         * the next one.
                         */
-                       ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
+                       ring_buffer_consume(iter->tr->buffer, iter->cpu,
+                                           NULL, NULL);
                        event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
-                                                NULL);
+                                                NULL, NULL);
                }
 
                if (!event)
@@ -526,17 +527,18 @@ get_return_for_leaf(struct trace_iterator *iter,
 
 /* Signal an overhead of time execution to the output */
 static int
-print_graph_overhead(unsigned long long duration, struct trace_seq *s)
+print_graph_overhead(unsigned long long duration, struct trace_seq *s,
+                    u32 flags)
 {
        /* If duration disappears, we don't need anything */
-       if (!(tracer_flags.val & TRACE_GRAPH_PRINT_DURATION))
+       if (!(flags & TRACE_GRAPH_PRINT_DURATION))
                return 1;
 
        /* Non nested entry or return */
        if (duration == -1)
                return trace_seq_printf(s, "  ");
 
-       if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
+       if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
                /* Duration exceeded 100 msecs */
                if (duration > 100000ULL)
                        return trace_seq_printf(s, "! ");
@@ -562,7 +564,7 @@ static int print_graph_abs_time(u64 t, struct trace_seq *s)
 
 static enum print_line_t
 print_graph_irq(struct trace_iterator *iter, unsigned long addr,
-               enum trace_type type, int cpu, pid_t pid)
+               enum trace_type type, int cpu, pid_t pid, u32 flags)
 {
        int ret;
        struct trace_seq *s = &iter->seq;
@@ -572,21 +574,21 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
                return TRACE_TYPE_UNHANDLED;
 
        /* Absolute time */
-       if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
+       if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
                ret = print_graph_abs_time(iter->ts, s);
                if (!ret)
                        return TRACE_TYPE_PARTIAL_LINE;
        }
 
        /* Cpu */
-       if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
+       if (flags & TRACE_GRAPH_PRINT_CPU) {
                ret = print_graph_cpu(s, cpu);
                if (ret == TRACE_TYPE_PARTIAL_LINE)
                        return TRACE_TYPE_PARTIAL_LINE;
        }
 
        /* Proc */
-       if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
+       if (flags & TRACE_GRAPH_PRINT_PROC) {
                ret = print_graph_proc(s, pid);
                if (ret == TRACE_TYPE_PARTIAL_LINE)
                        return TRACE_TYPE_PARTIAL_LINE;
@@ -596,7 +598,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
        }
 
        /* No overhead */
-       ret = print_graph_overhead(-1, s);
+       ret = print_graph_overhead(-1, s, flags);
        if (!ret)
                return TRACE_TYPE_PARTIAL_LINE;
 
@@ -609,7 +611,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
                return TRACE_TYPE_PARTIAL_LINE;
 
        /* Don't close the duration column if we don't have one */
-       if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
+       if (flags & TRACE_GRAPH_PRINT_DURATION)
                trace_seq_printf(s, " |");
        ret = trace_seq_printf(s, "\n");
 
@@ -679,7 +681,8 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
 static enum print_line_t
 print_graph_entry_leaf(struct trace_iterator *iter,
                struct ftrace_graph_ent_entry *entry,
-               struct ftrace_graph_ret_entry *ret_entry, struct trace_seq *s)
+               struct ftrace_graph_ret_entry *ret_entry,
+               struct trace_seq *s, u32 flags)
 {
        struct fgraph_data *data = iter->private;
        struct ftrace_graph_ret *graph_ret;
@@ -711,12 +714,12 @@ print_graph_entry_leaf(struct trace_iterator *iter,
        }
 
        /* Overhead */
-       ret = print_graph_overhead(duration, s);
+       ret = print_graph_overhead(duration, s, flags);
        if (!ret)
                return TRACE_TYPE_PARTIAL_LINE;
 
        /* Duration */
-       if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
+       if (flags & TRACE_GRAPH_PRINT_DURATION) {
                ret = print_graph_duration(duration, s);
                if (ret == TRACE_TYPE_PARTIAL_LINE)
                        return TRACE_TYPE_PARTIAL_LINE;
@@ -739,7 +742,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
 static enum print_line_t
 print_graph_entry_nested(struct trace_iterator *iter,
                         struct ftrace_graph_ent_entry *entry,
-                        struct trace_seq *s, int cpu)
+                        struct trace_seq *s, int cpu, u32 flags)
 {
        struct ftrace_graph_ent *call = &entry->graph_ent;
        struct fgraph_data *data = iter->private;
@@ -759,12 +762,12 @@ print_graph_entry_nested(struct trace_iterator *iter,
        }
 
        /* No overhead */
-       ret = print_graph_overhead(-1, s);
+       ret = print_graph_overhead(-1, s, flags);
        if (!ret)
                return TRACE_TYPE_PARTIAL_LINE;
 
        /* No time */
-       if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
+       if (flags & TRACE_GRAPH_PRINT_DURATION) {
                ret = trace_seq_printf(s, "            |  ");
                if (!ret)
                        return TRACE_TYPE_PARTIAL_LINE;
@@ -790,7 +793,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
 
 static enum print_line_t
 print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
-                    int type, unsigned long addr)
+                    int type, unsigned long addr, u32 flags)
 {
        struct fgraph_data *data = iter->private;
        struct trace_entry *ent = iter->ent;
@@ -803,27 +806,27 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
 
        if (type) {
                /* Interrupt */
-               ret = print_graph_irq(iter, addr, type, cpu, ent->pid);
+               ret = print_graph_irq(iter, addr, type, cpu, ent->pid, flags);
                if (ret == TRACE_TYPE_PARTIAL_LINE)
                        return TRACE_TYPE_PARTIAL_LINE;
        }
 
        /* Absolute time */
-       if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
+       if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
                ret = print_graph_abs_time(iter->ts, s);
                if (!ret)
                        return TRACE_TYPE_PARTIAL_LINE;
        }
 
        /* Cpu */
-       if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
+       if (flags & TRACE_GRAPH_PRINT_CPU) {
                ret = print_graph_cpu(s, cpu);
                if (ret == TRACE_TYPE_PARTIAL_LINE)
                        return TRACE_TYPE_PARTIAL_LINE;
        }
 
        /* Proc */
-       if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
+       if (flags & TRACE_GRAPH_PRINT_PROC) {
                ret = print_graph_proc(s, ent->pid);
                if (ret == TRACE_TYPE_PARTIAL_LINE)
                        return TRACE_TYPE_PARTIAL_LINE;
@@ -845,7 +848,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
 
 static enum print_line_t
 print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
-                       struct trace_iterator *iter)
+                       struct trace_iterator *iter, u32 flags)
 {
        struct fgraph_data *data = iter->private;
        struct ftrace_graph_ent *call = &field->graph_ent;
@@ -853,14 +856,14 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
        static enum print_line_t ret;
        int cpu = iter->cpu;
 
-       if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func))
+       if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags))
                return TRACE_TYPE_PARTIAL_LINE;
 
        leaf_ret = get_return_for_leaf(iter, field);
        if (leaf_ret)
-               ret = print_graph_entry_leaf(iter, field, leaf_ret, s);
+               ret = print_graph_entry_leaf(iter, field, leaf_ret, s, flags);
        else
-               ret = print_graph_entry_nested(iter, field, s, cpu);
+               ret = print_graph_entry_nested(iter, field, s, cpu, flags);
 
        if (data) {
                /*
@@ -879,7 +882,8 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
 
 static enum print_line_t
 print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
-                  struct trace_entry *ent, struct trace_iterator *iter)
+                  struct trace_entry *ent, struct trace_iterator *iter,
+                  u32 flags)
 {
        unsigned long long duration = trace->rettime - trace->calltime;
        struct fgraph_data *data = iter->private;
@@ -909,16 +913,16 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
                }
        }
 
-       if (print_graph_prologue(iter, s, 0, 0))
+       if (print_graph_prologue(iter, s, 0, 0, flags))
                return TRACE_TYPE_PARTIAL_LINE;
 
        /* Overhead */
-       ret = print_graph_overhead(duration, s);
+       ret = print_graph_overhead(duration, s, flags);
        if (!ret)
                return TRACE_TYPE_PARTIAL_LINE;
 
        /* Duration */
-       if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
+       if (flags & TRACE_GRAPH_PRINT_DURATION) {
                ret = print_graph_duration(duration, s);
                if (ret == TRACE_TYPE_PARTIAL_LINE)
                        return TRACE_TYPE_PARTIAL_LINE;
@@ -948,14 +952,15 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
        }
 
        /* Overrun */
-       if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) {
+       if (flags & TRACE_GRAPH_PRINT_OVERRUN) {
                ret = trace_seq_printf(s, " (Overruns: %lu)\n",
                                        trace->overrun);
                if (!ret)
                        return TRACE_TYPE_PARTIAL_LINE;
        }
 
-       ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, cpu, pid);
+       ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET,
+                             cpu, pid, flags);
        if (ret == TRACE_TYPE_PARTIAL_LINE)
                return TRACE_TYPE_PARTIAL_LINE;
 
@@ -963,8 +968,8 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
 }
 
 static enum print_line_t
-print_graph_comment(struct trace_seq *s,  struct trace_entry *ent,
-                   struct trace_iterator *iter)
+print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
+                   struct trace_iterator *iter, u32 flags)
 {
        unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
        struct fgraph_data *data = iter->private;
@@ -976,16 +981,16 @@ print_graph_comment(struct trace_seq *s,  struct trace_entry *ent,
        if (data)
                depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
 
-       if (print_graph_prologue(iter, s, 0, 0))
+       if (print_graph_prologue(iter, s, 0, 0, flags))
                return TRACE_TYPE_PARTIAL_LINE;
 
        /* No overhead */
-       ret = print_graph_overhead(-1, s);
+       ret = print_graph_overhead(-1, s, flags);
        if (!ret)
                return TRACE_TYPE_PARTIAL_LINE;
 
        /* No time */
-       if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
+       if (flags & TRACE_GRAPH_PRINT_DURATION) {
                ret = trace_seq_printf(s, "            |  ");
                if (!ret)
                        return TRACE_TYPE_PARTIAL_LINE;
@@ -1020,7 +1025,7 @@ print_graph_comment(struct trace_seq *s,  struct trace_entry *ent,
                if (!event)
                        return TRACE_TYPE_UNHANDLED;
 
-               ret = event->trace(iter, sym_flags);
+               ret = event->funcs->trace(iter, sym_flags, event);
                if (ret != TRACE_TYPE_HANDLED)
                        return ret;
        }
@@ -1040,7 +1045,7 @@ print_graph_comment(struct trace_seq *s,  struct trace_entry *ent,
 
 
 enum print_line_t
-print_graph_function(struct trace_iterator *iter)
+print_graph_function_flags(struct trace_iterator *iter, u32 flags)
 {
        struct ftrace_graph_ent_entry *field;
        struct fgraph_data *data = iter->private;
@@ -1061,7 +1066,7 @@ print_graph_function(struct trace_iterator *iter)
        if (data && data->failed) {
                field = &data->ent;
                iter->cpu = data->cpu;
-               ret = print_graph_entry(field, s, iter);
+               ret = print_graph_entry(field, s, iter, flags);
                if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) {
                        per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1;
                        ret = TRACE_TYPE_NO_CONSUME;
@@ -1081,32 +1086,50 @@ print_graph_function(struct trace_iterator *iter)
                struct ftrace_graph_ent_entry saved;
                trace_assign_type(field, entry);
                saved = *field;
-               return print_graph_entry(&saved, s, iter);
+               return print_graph_entry(&saved, s, iter, flags);
        }
        case TRACE_GRAPH_RET: {
                struct ftrace_graph_ret_entry *field;
                trace_assign_type(field, entry);
-               return print_graph_return(&field->ret, s, entry, iter);
+               return print_graph_return(&field->ret, s, entry, iter, flags);
        }
+       case TRACE_STACK:
+       case TRACE_FN:
+               /* don't trace stack and functions as comments */
+               return TRACE_TYPE_UNHANDLED;
+
        default:
-               return print_graph_comment(s, entry, iter);
+               return print_graph_comment(s, entry, iter, flags);
        }
 
        return TRACE_TYPE_HANDLED;
 }
 
-static void print_lat_header(struct seq_file *s)
+static enum print_line_t
+print_graph_function(struct trace_iterator *iter)
+{
+       return print_graph_function_flags(iter, tracer_flags.val);
+}
+
+static enum print_line_t
+print_graph_function_event(struct trace_iterator *iter, int flags,
+                          struct trace_event *event)
+{
+       return print_graph_function(iter);
+}
+
+static void print_lat_header(struct seq_file *s, u32 flags)
 {
        static const char spaces[] = "                " /* 16 spaces */
                "    "                                  /* 4 spaces */
                "                 ";                    /* 17 spaces */
        int size = 0;
 
-       if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
+       if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
                size += 16;
-       if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
+       if (flags & TRACE_GRAPH_PRINT_CPU)
                size += 4;
-       if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
+       if (flags & TRACE_GRAPH_PRINT_PROC)
                size += 17;
 
        seq_printf(s, "#%.*s  _-----=> irqs-off        \n", size, spaces);
@@ -1117,43 +1140,48 @@ static void print_lat_header(struct seq_file *s)
        seq_printf(s, "#%.*s|||| /                     \n", size, spaces);
 }
 
-static void print_graph_headers(struct seq_file *s)
+void print_graph_headers_flags(struct seq_file *s, u32 flags)
 {
        int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
 
        if (lat)
-               print_lat_header(s);
+               print_lat_header(s, flags);
 
        /* 1st line */
        seq_printf(s, "#");
-       if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
+       if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
                seq_printf(s, "     TIME       ");
-       if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
+       if (flags & TRACE_GRAPH_PRINT_CPU)
                seq_printf(s, " CPU");
-       if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
+       if (flags & TRACE_GRAPH_PRINT_PROC)
                seq_printf(s, "  TASK/PID       ");
        if (lat)
                seq_printf(s, "|||||");
-       if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
+       if (flags & TRACE_GRAPH_PRINT_DURATION)
                seq_printf(s, "  DURATION   ");
        seq_printf(s, "               FUNCTION CALLS\n");
 
        /* 2nd line */
        seq_printf(s, "#");
-       if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
+       if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
                seq_printf(s, "      |         ");
-       if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
+       if (flags & TRACE_GRAPH_PRINT_CPU)
                seq_printf(s, " |  ");
-       if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
+       if (flags & TRACE_GRAPH_PRINT_PROC)
                seq_printf(s, "   |    |        ");
        if (lat)
                seq_printf(s, "|||||");
-       if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
+       if (flags & TRACE_GRAPH_PRINT_DURATION)
                seq_printf(s, "   |   |      ");
        seq_printf(s, "               |   |   |   |\n");
 }
 
-static void graph_trace_open(struct trace_iterator *iter)
+void print_graph_headers(struct seq_file *s)
+{
+       print_graph_headers_flags(s, tracer_flags.val);
+}
+
+void graph_trace_open(struct trace_iterator *iter)
 {
        /* pid and depth on the last trace processed */
        struct fgraph_data *data;
@@ -1188,7 +1216,7 @@ static void graph_trace_open(struct trace_iterator *iter)
        pr_warning("function graph tracer: not enough memory\n");
 }
 
-static void graph_trace_close(struct trace_iterator *iter)
+void graph_trace_close(struct trace_iterator *iter)
 {
        struct fgraph_data *data = iter->private;
 
@@ -1198,6 +1226,20 @@ static void graph_trace_close(struct trace_iterator *iter)
        }
 }
 
+static struct trace_event_functions graph_functions = {
+       .trace          = print_graph_function_event,
+};
+
+static struct trace_event graph_trace_entry_event = {
+       .type           = TRACE_GRAPH_ENT,
+       .funcs          = &graph_functions,
+};
+
+static struct trace_event graph_trace_ret_event = {
+       .type           = TRACE_GRAPH_RET,
+       .funcs          = &graph_functions
+};
+
 static struct tracer graph_trace __read_mostly = {
        .name           = "function_graph",
        .open           = graph_trace_open,
@@ -1219,6 +1261,16 @@ static __init int init_graph_trace(void)
 {
        max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
 
+       if (!register_ftrace_event(&graph_trace_entry_event)) {
+               pr_warning("Warning: could not register graph trace events\n");
+               return 1;
+       }
+
+       if (!register_ftrace_event(&graph_trace_ret_event)) {
+               pr_warning("Warning: could not register graph trace events\n");
+               return 1;
+       }
+
        return register_tracer(&graph_trace);
 }
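
One more API shift is visible in get_return_for_leaf() above: ring_buffer_consume() and ring_buffer_peek() take an extra out-parameter for events lost to overwrites, which this file simply passes as NULL. A sketch of a reader that does care; the parameter name is assumed from ring_buffer.h:

	unsigned long lost_events = 0;
	struct ring_buffer_event *event;

	event = ring_buffer_consume(buffer, cpu, NULL, &lost_events);
	if (lost_events)
		pr_debug("%lu events overwritten before this read\n", lost_events);
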
 
index 2974bc7538c74603b15f85b75a62f74e70f7bb4d..6fd486e0cef407b1cf3c416d15c31be14bf8dd6a 100644 (file)
@@ -34,6 +34,9 @@ static int trace_type __read_mostly;
 
 static int save_lat_flag;
 
+static void stop_irqsoff_tracer(struct trace_array *tr, int graph);
+static int start_irqsoff_tracer(struct trace_array *tr, int graph);
+
 #ifdef CONFIG_PREEMPT_TRACER
 static inline int
 preempt_trace(void)
@@ -55,6 +58,23 @@ irq_trace(void)
 # define irq_trace() (0)
 #endif
 
+#define TRACE_DISPLAY_GRAPH    1
+
+static struct tracer_opt trace_opts[] = {
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       /* display latency trace as call graph */
+       { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
+#endif
+       { } /* Empty entry */
+};
+
+static struct tracer_flags tracer_flags = {
+       .val  = 0,
+       .opts = trace_opts,
+};
+
+#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
+
 /*
  * Sequence count - we record it when starting a measurement and
  * skip the latency if the sequence has changed - some other section
@@ -108,6 +128,202 @@ static struct ftrace_ops trace_ops __read_mostly =
 };
 #endif /* CONFIG_FUNCTION_TRACER */
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
+{
+       int cpu;
+
+       if (!(bit & TRACE_DISPLAY_GRAPH))
+               return -EINVAL;
+
+       if (!(is_graph() ^ set))
+               return 0;
+
+       stop_irqsoff_tracer(irqsoff_trace, !set);
+
+       for_each_possible_cpu(cpu)
+               per_cpu(tracing_cpu, cpu) = 0;
+
+       tracing_max_latency = 0;
+       tracing_reset_online_cpus(irqsoff_trace);
+
+       return start_irqsoff_tracer(irqsoff_trace, set);
+}
+
+static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
+{
+       struct trace_array *tr = irqsoff_trace;
+       struct trace_array_cpu *data;
+       unsigned long flags;
+       long disabled;
+       int ret;
+       int cpu;
+       int pc;
+
+       cpu = raw_smp_processor_id();
+       if (likely(!per_cpu(tracing_cpu, cpu)))
+               return 0;
+
+       local_save_flags(flags);
+       /* slight chance to get a false positive on tracing_cpu */
+       if (!irqs_disabled_flags(flags))
+               return 0;
+
+       data = tr->data[cpu];
+       disabled = atomic_inc_return(&data->disabled);
+
+       if (likely(disabled == 1)) {
+               pc = preempt_count();
+               ret = __trace_graph_entry(tr, trace, flags, pc);
+       } else
+               ret = 0;
+
+       atomic_dec(&data->disabled);
+       return ret;
+}
+
+static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
+{
+       struct trace_array *tr = irqsoff_trace;
+       struct trace_array_cpu *data;
+       unsigned long flags;
+       long disabled;
+       int cpu;
+       int pc;
+
+       cpu = raw_smp_processor_id();
+       if (likely(!per_cpu(tracing_cpu, cpu)))
+               return;
+
+       local_save_flags(flags);
+       /* slight chance to get a false positive on tracing_cpu */
+       if (!irqs_disabled_flags(flags))
+               return;
+
+       data = tr->data[cpu];
+       disabled = atomic_inc_return(&data->disabled);
+
+       if (likely(disabled == 1)) {
+               pc = preempt_count();
+               __trace_graph_return(tr, trace, flags, pc);
+       }
+
+       atomic_dec(&data->disabled);
+}
+
+static void irqsoff_trace_open(struct trace_iterator *iter)
+{
+       if (is_graph())
+               graph_trace_open(iter);
+
+}
+
+static void irqsoff_trace_close(struct trace_iterator *iter)
+{
+       if (iter->private)
+               graph_trace_close(iter);
+}
+
+#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \
+                           TRACE_GRAPH_PRINT_PROC)
+
+static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
+{
+       u32 flags = GRAPH_TRACER_FLAGS;
+
+       if (trace_flags & TRACE_ITER_LATENCY_FMT)
+               flags |= TRACE_GRAPH_PRINT_DURATION;
+       else
+               flags |= TRACE_GRAPH_PRINT_ABS_TIME;
+
+       /*
+        * In graph mode call the graph tracer output function,
+        * otherwise go with the TRACE_FN event handler
+        */
+       if (is_graph())
+               return print_graph_function_flags(iter, flags);
+
+       return TRACE_TYPE_UNHANDLED;
+}
+
+static void irqsoff_print_header(struct seq_file *s)
+{
+       if (is_graph()) {
+               struct trace_iterator *iter = s->private;
+               u32 flags = GRAPH_TRACER_FLAGS;
+
+               if (trace_flags & TRACE_ITER_LATENCY_FMT) {
+                       /* print nothing if the buffers are empty */
+                       if (trace_empty(iter))
+                               return;
+
+                       print_trace_header(s, iter);
+                       flags |= TRACE_GRAPH_PRINT_DURATION;
+               } else
+                       flags |= TRACE_GRAPH_PRINT_ABS_TIME;
+
+               print_graph_headers_flags(s, flags);
+       } else
+               trace_default_header(s);
+}
+
+static void
+trace_graph_function(struct trace_array *tr,
+                unsigned long ip, unsigned long flags, int pc)
+{
+       u64 time = trace_clock_local();
+       struct ftrace_graph_ent ent = {
+               .func  = ip,
+               .depth = 0,
+       };
+       struct ftrace_graph_ret ret = {
+               .func     = ip,
+               .depth    = 0,
+               .calltime = time,
+               .rettime  = time,
+       };
+
+       __trace_graph_entry(tr, &ent, flags, pc);
+       __trace_graph_return(tr, &ret, flags, pc);
+}
+
+static void
+__trace_function(struct trace_array *tr,
+                unsigned long ip, unsigned long parent_ip,
+                unsigned long flags, int pc)
+{
+       if (!is_graph())
+               trace_function(tr, ip, parent_ip, flags, pc);
+       else {
+               trace_graph_function(tr, parent_ip, flags, pc);
+               trace_graph_function(tr, ip, flags, pc);
+       }
+}
+
+#else
+#define __trace_function trace_function
+
+static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
+{
+       return -EINVAL;
+}
+
+static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
+{
+       return -1;
+}
+
+static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
+{
+       return TRACE_TYPE_UNHANDLED;
+}
+
+static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { }
+static void irqsoff_print_header(struct seq_file *s) { }
+static void irqsoff_trace_open(struct trace_iterator *iter) { }
+static void irqsoff_trace_close(struct trace_iterator *iter) { }
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
 /*
  * Should this new latency be reported/recorded?
  */
@@ -150,7 +366,7 @@ check_critical_timing(struct trace_array *tr,
        if (!report_latency(delta))
                goto out_unlock;
 
-       trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
+       __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
        /* Skip 5 functions to get to the irq/preempt enable function */
        __trace_stack(tr, flags, 5, pc);
 
@@ -172,7 +388,7 @@ out_unlock:
 out:
        data->critical_sequence = max_sequence;
        data->preempt_timestamp = ftrace_now(cpu);
-       trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
+       __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
 }
 
 static inline void
@@ -204,7 +420,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
 
        local_save_flags(flags);
 
-       trace_function(tr, ip, parent_ip, flags, preempt_count());
+       __trace_function(tr, ip, parent_ip, flags, preempt_count());
 
        per_cpu(tracing_cpu, cpu) = 1;
 
@@ -238,7 +454,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
        atomic_inc(&data->disabled);
 
        local_save_flags(flags);
-       trace_function(tr, ip, parent_ip, flags, preempt_count());
+       __trace_function(tr, ip, parent_ip, flags, preempt_count());
        check_critical_timing(tr, data, parent_ip ? : ip, cpu);
        data->critical_start = 0;
        atomic_dec(&data->disabled);
@@ -347,19 +563,32 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
 }
 #endif /* CONFIG_PREEMPT_TRACER */
 
-static void start_irqsoff_tracer(struct trace_array *tr)
+static int start_irqsoff_tracer(struct trace_array *tr, int graph)
 {
-       register_ftrace_function(&trace_ops);
-       if (tracing_is_enabled())
+       int ret = 0;
+
+       if (!graph)
+               ret = register_ftrace_function(&trace_ops);
+       else
+               ret = register_ftrace_graph(&irqsoff_graph_return,
+                                           &irqsoff_graph_entry);
+
+       if (!ret && tracing_is_enabled())
                tracer_enabled = 1;
        else
                tracer_enabled = 0;
+
+       return ret;
 }
 
-static void stop_irqsoff_tracer(struct trace_array *tr)
+static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
 {
        tracer_enabled = 0;
-       unregister_ftrace_function(&trace_ops);
+
+       if (!graph)
+               unregister_ftrace_function(&trace_ops);
+       else
+               unregister_ftrace_graph();
 }
 
 static void __irqsoff_tracer_init(struct trace_array *tr)
@@ -372,12 +601,14 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
        /* make sure that the tracer is visible */
        smp_wmb();
        tracing_reset_online_cpus(tr);
-       start_irqsoff_tracer(tr);
+
+       if (start_irqsoff_tracer(tr, is_graph()))
+               printk(KERN_ERR "failed to start irqsoff tracer\n");
 }
 
 static void irqsoff_tracer_reset(struct trace_array *tr)
 {
-       stop_irqsoff_tracer(tr);
+       stop_irqsoff_tracer(tr, is_graph());
 
        if (!save_lat_flag)
                trace_flags &= ~TRACE_ITER_LATENCY_FMT;
@@ -409,9 +640,15 @@ static struct tracer irqsoff_tracer __read_mostly =
        .start          = irqsoff_tracer_start,
        .stop           = irqsoff_tracer_stop,
        .print_max      = 1,
+       .print_header   = irqsoff_print_header,
+       .print_line     = irqsoff_print_line,
+       .flags          = &tracer_flags,
+       .set_flag       = irqsoff_set_flag,
 #ifdef CONFIG_FTRACE_SELFTEST
        .selftest    = trace_selftest_startup_irqsoff,
 #endif
+       .open           = irqsoff_trace_open,
+       .close          = irqsoff_trace_close,
 };
 # define register_irqsoff(trace) register_tracer(&trace)
 #else
@@ -435,9 +672,15 @@ static struct tracer preemptoff_tracer __read_mostly =
        .start          = irqsoff_tracer_start,
        .stop           = irqsoff_tracer_stop,
        .print_max      = 1,
+       .print_header   = irqsoff_print_header,
+       .print_line     = irqsoff_print_line,
+       .flags          = &tracer_flags,
+       .set_flag       = irqsoff_set_flag,
 #ifdef CONFIG_FTRACE_SELFTEST
        .selftest    = trace_selftest_startup_preemptoff,
 #endif
+       .open           = irqsoff_trace_open,
+       .close          = irqsoff_trace_close,
 };
 # define register_preemptoff(trace) register_tracer(&trace)
 #else
@@ -463,9 +706,15 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
        .start          = irqsoff_tracer_start,
        .stop           = irqsoff_tracer_stop,
        .print_max      = 1,
+       .print_header   = irqsoff_print_header,
+       .print_line     = irqsoff_print_line,
+       .flags          = &tracer_flags,
+       .set_flag       = irqsoff_set_flag,
 #ifdef CONFIG_FTRACE_SELFTEST
        .selftest    = trace_selftest_startup_preemptirqsoff,
 #endif
+       .open           = irqsoff_trace_open,
+       .close          = irqsoff_trace_close,
 };
 
 # define register_preemptirqsoff(trace) register_tracer(&trace)
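
The irqsoff family now exposes a display-graph tracer option (it should appear as options/display-graph under the tracing debugfs directory while one of these tracers is active) and defers to the shared graph formatter when it is set. A condensed sketch of the print_line fallback pattern used above (my_print_line is a placeholder name):

	static enum print_line_t my_print_line(struct trace_iterator *iter)
	{
		if (is_graph())
			return print_graph_function_flags(iter,
					TRACE_GRAPH_PRINT_CPU |
					TRACE_GRAPH_PRINT_PROC);

		/* UNHANDLED hands the entry back to the default formatter. */
		return TRACE_TYPE_UNHANDLED;
	}
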
index a7514326052b658b88e69029a7754e197b7c2f56..9a082bba95379d89c9aa04a140b6c0b0fd555fa8 100644 (file)
@@ -324,8 +324,8 @@ struct trace_probe {
        unsigned long           nhit;
        unsigned int            flags;  /* For TP_FLAG_* */
        const char              *symbol;        /* symbol name */
+       struct ftrace_event_class       class;
        struct ftrace_event_call        call;
-       struct trace_event              event;
        ssize_t                 size;           /* trace entry size */
        unsigned int            nr_args;
        struct probe_arg        args[];
@@ -404,6 +404,7 @@ static struct trace_probe *alloc_trace_probe(const char *group,
                goto error;
        }
 
+       tp->call.class = &tp->class;
        tp->call.name = kstrdup(event, GFP_KERNEL);
        if (!tp->call.name)
                goto error;
@@ -413,8 +414,8 @@ static struct trace_probe *alloc_trace_probe(const char *group,
                goto error;
        }
 
-       tp->call.system = kstrdup(group, GFP_KERNEL);
-       if (!tp->call.system)
+       tp->class.system = kstrdup(group, GFP_KERNEL);
+       if (!tp->class.system)
                goto error;
 
        INIT_LIST_HEAD(&tp->list);
@@ -443,7 +444,7 @@ static void free_trace_probe(struct trace_probe *tp)
        for (i = 0; i < tp->nr_args; i++)
                free_probe_arg(&tp->args[i]);
 
-       kfree(tp->call.system);
+       kfree(tp->call.class->system);
        kfree(tp->call.name);
        kfree(tp->symbol);
        kfree(tp);
@@ -456,7 +457,7 @@ static struct trace_probe *find_probe_event(const char *event,
 
        list_for_each_entry(tp, &probe_list, list)
                if (strcmp(tp->call.name, event) == 0 &&
-                   strcmp(tp->call.system, group) == 0)
+                   strcmp(tp->call.class->system, group) == 0)
                        return tp;
        return NULL;
 }
@@ -481,7 +482,7 @@ static int register_trace_probe(struct trace_probe *tp)
        mutex_lock(&probe_lock);
 
        /* register as an event */
-       old_tp = find_probe_event(tp->call.name, tp->call.system);
+       old_tp = find_probe_event(tp->call.name, tp->call.class->system);
        if (old_tp) {
                /* delete old event */
                unregister_trace_probe(old_tp);
@@ -904,7 +905,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
        int i;
 
        seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
-       seq_printf(m, ":%s/%s", tp->call.system, tp->call.name);
+       seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name);
 
        if (!tp->symbol)
                seq_printf(m, " 0x%p", tp->rp.kp.addr);
@@ -1061,8 +1062,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
 
        size = sizeof(*entry) + tp->size;
 
-       event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
-                                                 irq_flags, pc);
+       event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
+                                                 size, irq_flags, pc);
        if (!event)
                return;
 
@@ -1094,8 +1095,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
 
        size = sizeof(*entry) + tp->size;
 
-       event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
-                                                 irq_flags, pc);
+       event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
+                                                 size, irq_flags, pc);
        if (!event)
                return;
 
@@ -1112,18 +1113,17 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
 
 /* Event entry printers */
 enum print_line_t
-print_kprobe_event(struct trace_iterator *iter, int flags)
+print_kprobe_event(struct trace_iterator *iter, int flags,
+                  struct trace_event *event)
 {
        struct kprobe_trace_entry_head *field;
        struct trace_seq *s = &iter->seq;
-       struct trace_event *event;
        struct trace_probe *tp;
        u8 *data;
        int i;
 
        field = (struct kprobe_trace_entry_head *)iter->ent;
-       event = ftrace_find_event(field->ent.type);
-       tp = container_of(event, struct trace_probe, event);
+       tp = container_of(event, struct trace_probe, call.event);
 
        if (!trace_seq_printf(s, "%s: (", tp->call.name))
                goto partial;
@@ -1149,18 +1149,17 @@ partial:
 }
 
 enum print_line_t
-print_kretprobe_event(struct trace_iterator *iter, int flags)
+print_kretprobe_event(struct trace_iterator *iter, int flags,
+                     struct trace_event *event)
 {
        struct kretprobe_trace_entry_head *field;
        struct trace_seq *s = &iter->seq;
-       struct trace_event *event;
        struct trace_probe *tp;
        u8 *data;
        int i;
 
        field = (struct kretprobe_trace_entry_head *)iter->ent;
-       event = ftrace_find_event(field->ent.type);
-       tp = container_of(event, struct trace_probe, event);
+       tp = container_of(event, struct trace_probe, call.event);
 
        if (!trace_seq_printf(s, "%s: (", tp->call.name))
                goto partial;
@@ -1217,8 +1216,6 @@ static void probe_event_disable(struct ftrace_event_call *call)
 
 static int probe_event_raw_init(struct ftrace_event_call *event_call)
 {
-       INIT_LIST_HEAD(&event_call->fields);
-
        return 0;
 }
 
@@ -1353,7 +1350,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
                     "profile buffer not large enough"))
                return;
 
-       entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags);
+       entry = perf_trace_buf_prepare(size, call->event.type,
+                                      &rctx, &irq_flags);
        if (!entry)
                return;
 
@@ -1384,7 +1382,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
                     "profile buffer not large enough"))
                return;
 
-       entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags);
+       entry = perf_trace_buf_prepare(size, call->event.type,
+                                      &rctx, &irq_flags);
        if (!entry)
                return;
 
@@ -1425,6 +1424,26 @@ static void probe_perf_disable(struct ftrace_event_call *call)
 }
 #endif /* CONFIG_PERF_EVENTS */
 
+static __kprobes
+int kprobe_register(struct ftrace_event_call *event, enum trace_reg type)
+{
+       switch (type) {
+       case TRACE_REG_REGISTER:
+               return probe_event_enable(event);
+       case TRACE_REG_UNREGISTER:
+               probe_event_disable(event);
+               return 0;
+
+#ifdef CONFIG_PERF_EVENTS
+       case TRACE_REG_PERF_REGISTER:
+               return probe_perf_enable(event);
+       case TRACE_REG_PERF_UNREGISTER:
+               probe_perf_disable(event);
+               return 0;
+#endif
+       }
+       return 0;
+}
 
 static __kprobes
 int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
@@ -1454,6 +1473,14 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
        return 0;       /* We don't tweek kernel, so just return 0 */
 }
 
+static struct trace_event_functions kretprobe_funcs = {
+       .trace          = print_kretprobe_event
+};
+
+static struct trace_event_functions kprobe_funcs = {
+       .trace          = print_kprobe_event
+};
+
 static int register_probe_event(struct trace_probe *tp)
 {
        struct ftrace_event_call *call = &tp->call;
@@ -1461,36 +1488,31 @@ static int register_probe_event(struct trace_probe *tp)
 
        /* Initialize ftrace_event_call */
        if (probe_is_return(tp)) {
-               tp->event.trace = print_kretprobe_event;
-               call->raw_init = probe_event_raw_init;
-               call->define_fields = kretprobe_event_define_fields;
+               INIT_LIST_HEAD(&call->class->fields);
+               call->event.funcs = &kretprobe_funcs;
+               call->class->raw_init = probe_event_raw_init;
+               call->class->define_fields = kretprobe_event_define_fields;
        } else {
-               tp->event.trace = print_kprobe_event;
-               call->raw_init = probe_event_raw_init;
-               call->define_fields = kprobe_event_define_fields;
+               INIT_LIST_HEAD(&call->class->fields);
+               call->event.funcs = &kprobe_funcs;
+               call->class->raw_init = probe_event_raw_init;
+               call->class->define_fields = kprobe_event_define_fields;
        }
        if (set_print_fmt(tp) < 0)
                return -ENOMEM;
-       call->event = &tp->event;
-       call->id = register_ftrace_event(&tp->event);
-       if (!call->id) {
+       ret = register_ftrace_event(&call->event);
+       if (!ret) {
                kfree(call->print_fmt);
                return -ENODEV;
        }
-       call->enabled = 0;
-       call->regfunc = probe_event_enable;
-       call->unregfunc = probe_event_disable;
-
-#ifdef CONFIG_PERF_EVENTS
-       call->perf_event_enable = probe_perf_enable;
-       call->perf_event_disable = probe_perf_disable;
-#endif
+       call->flags = 0;
+       call->class->reg = kprobe_register;
        call->data = tp;
        ret = trace_add_event_call(call);
        if (ret) {
                pr_info("Failed to register kprobe event: %s\n", call->name);
                kfree(call->print_fmt);
-               unregister_ftrace_event(&tp->event);
+               unregister_ftrace_event(&call->event);
        }
        return ret;
 }
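
In the kprobe changes above, the separate regfunc/unregfunc and perf enable/disable callbacks on the event are replaced by a single class ->reg() operation that dispatches on enum trace_reg, and printing goes through a shared trace_event_functions table instead of a per-probe trace_event method. A compact sketch of that single-dispatcher pattern, with stand-in enable/disable helpers rather than the kernel ones, follows.

/* Sketch of the TRACE_REG_* dispatcher pattern used by
 * kprobe_register() above; the enum values and helpers are simplified
 * stand-ins, not kernel definitions. */
enum trace_reg {
        TRACE_REG_REGISTER,
        TRACE_REG_UNREGISTER,
        TRACE_REG_PERF_REGISTER,
        TRACE_REG_PERF_UNREGISTER,
};

struct event;                                     /* opaque here */

static int  event_enable(struct event *e)         { (void)e; return 0; }
static void event_disable(struct event *e)        { (void)e; }
static int  event_perf_enable(struct event *e)    { (void)e; return 0; }
static void event_perf_disable(struct event *e)   { (void)e; }

/* One callback replaces four: both ftrace and perf call it with the
 * operation they need, so the event no longer carries four pointers. */
static int event_reg(struct event *e, enum trace_reg type)
{
        switch (type) {
        case TRACE_REG_REGISTER:
                return event_enable(e);
        case TRACE_REG_UNREGISTER:
                event_disable(e);
                return 0;
        case TRACE_REG_PERF_REGISTER:
                return event_perf_enable(e);
        case TRACE_REG_PERF_UNREGISTER:
                event_perf_disable(e);
                return 0;
        }
        return 0;
}

int main(void)
{
        return event_reg((struct event *)0, TRACE_REG_REGISTER);
}
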
index 8e46b3323cdcdd91ad9833aa1eb8d0f1bf82f3af..fc9d4dbb089e9067605e327019ab8846a05752cf 100644 (file)
@@ -253,7 +253,7 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)
        void *ret;
 
        if (s->full)
-               return 0;
+               return NULL;
 
        if (len > ((PAGE_SIZE - 1) - s->len)) {
                s->full = 1;
@@ -726,6 +726,9 @@ int register_ftrace_event(struct trace_event *event)
        if (WARN_ON(!event))
                goto out;
 
+       if (WARN_ON(!event->funcs))
+               goto out;
+
        INIT_LIST_HEAD(&event->list);
 
        if (!event->type) {
@@ -758,14 +761,14 @@ int register_ftrace_event(struct trace_event *event)
                        goto out;
        }
 
-       if (event->trace == NULL)
-               event->trace = trace_nop_print;
-       if (event->raw == NULL)
-               event->raw = trace_nop_print;
-       if (event->hex == NULL)
-               event->hex = trace_nop_print;
-       if (event->binary == NULL)
-               event->binary = trace_nop_print;
+       if (event->funcs->trace == NULL)
+               event->funcs->trace = trace_nop_print;
+       if (event->funcs->raw == NULL)
+               event->funcs->raw = trace_nop_print;
+       if (event->funcs->hex == NULL)
+               event->funcs->hex = trace_nop_print;
+       if (event->funcs->binary == NULL)
+               event->funcs->binary = trace_nop_print;
 
        key = event->type & (EVENT_HASHSIZE - 1);
 
@@ -807,13 +810,15 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event);
  * Standard events
  */
 
-enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags)
+enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
+                                 struct trace_event *event)
 {
        return TRACE_TYPE_HANDLED;
 }
 
 /* TRACE_FN */
-static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
+                                       struct trace_event *event)
 {
        struct ftrace_entry *field;
        struct trace_seq *s = &iter->seq;
@@ -840,7 +845,8 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags)
        return TRACE_TYPE_PARTIAL_LINE;
 }
 
-static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags,
+                                     struct trace_event *event)
 {
        struct ftrace_entry *field;
 
@@ -854,7 +860,8 @@ static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags)
        return TRACE_TYPE_HANDLED;
 }
 
-static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags,
+                                     struct trace_event *event)
 {
        struct ftrace_entry *field;
        struct trace_seq *s = &iter->seq;
@@ -867,7 +874,8 @@ static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags)
        return TRACE_TYPE_HANDLED;
 }
 
-static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags,
+                                     struct trace_event *event)
 {
        struct ftrace_entry *field;
        struct trace_seq *s = &iter->seq;
@@ -880,14 +888,18 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags)
        return TRACE_TYPE_HANDLED;
 }
 
-static struct trace_event trace_fn_event = {
-       .type           = TRACE_FN,
+static struct trace_event_functions trace_fn_funcs = {
        .trace          = trace_fn_trace,
        .raw            = trace_fn_raw,
        .hex            = trace_fn_hex,
        .binary         = trace_fn_bin,
 };
 
+static struct trace_event trace_fn_event = {
+       .type           = TRACE_FN,
+       .funcs          = &trace_fn_funcs,
+};
+
 /* TRACE_CTX an TRACE_WAKE */
 static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
                                             char *delim)
@@ -916,13 +928,14 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
        return TRACE_TYPE_HANDLED;
 }
 
-static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags,
+                                        struct trace_event *event)
 {
        return trace_ctxwake_print(iter, "==>");
 }
 
 static enum print_line_t trace_wake_print(struct trace_iterator *iter,
-                                         int flags)
+                                         int flags, struct trace_event *event)
 {
        return trace_ctxwake_print(iter, "  +");
 }
@@ -950,12 +963,14 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
        return TRACE_TYPE_HANDLED;
 }
 
-static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags,
+                                      struct trace_event *event)
 {
        return trace_ctxwake_raw(iter, 0);
 }
 
-static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags,
+                                       struct trace_event *event)
 {
        return trace_ctxwake_raw(iter, '+');
 }
@@ -984,18 +999,20 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
        return TRACE_TYPE_HANDLED;
 }
 
-static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags,
+                                      struct trace_event *event)
 {
        return trace_ctxwake_hex(iter, 0);
 }
 
-static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags,
+                                       struct trace_event *event)
 {
        return trace_ctxwake_hex(iter, '+');
 }
 
 static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
-                                          int flags)
+                                          int flags, struct trace_event *event)
 {
        struct ctx_switch_entry *field;
        struct trace_seq *s = &iter->seq;
@@ -1012,25 +1029,33 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
        return TRACE_TYPE_HANDLED;
 }
 
-static struct trace_event trace_ctx_event = {
-       .type           = TRACE_CTX,
+static struct trace_event_functions trace_ctx_funcs = {
        .trace          = trace_ctx_print,
        .raw            = trace_ctx_raw,
        .hex            = trace_ctx_hex,
        .binary         = trace_ctxwake_bin,
 };
 
-static struct trace_event trace_wake_event = {
-       .type           = TRACE_WAKE,
+static struct trace_event trace_ctx_event = {
+       .type           = TRACE_CTX,
+       .funcs          = &trace_ctx_funcs,
+};
+
+static struct trace_event_functions trace_wake_funcs = {
        .trace          = trace_wake_print,
        .raw            = trace_wake_raw,
        .hex            = trace_wake_hex,
        .binary         = trace_ctxwake_bin,
 };
 
+static struct trace_event trace_wake_event = {
+       .type           = TRACE_WAKE,
+       .funcs          = &trace_wake_funcs,
+};
+
 /* TRACE_SPECIAL */
 static enum print_line_t trace_special_print(struct trace_iterator *iter,
-                                            int flags)
+                                            int flags, struct trace_event *event)
 {
        struct special_entry *field;
 
@@ -1046,7 +1071,7 @@ static enum print_line_t trace_special_print(struct trace_iterator *iter,
 }
 
 static enum print_line_t trace_special_hex(struct trace_iterator *iter,
-                                          int flags)
+                                          int flags, struct trace_event *event)
 {
        struct special_entry *field;
        struct trace_seq *s = &iter->seq;
@@ -1061,7 +1086,7 @@ static enum print_line_t trace_special_hex(struct trace_iterator *iter,
 }
 
 static enum print_line_t trace_special_bin(struct trace_iterator *iter,
-                                          int flags)
+                                          int flags, struct trace_event *event)
 {
        struct special_entry *field;
        struct trace_seq *s = &iter->seq;
@@ -1075,18 +1100,22 @@ static enum print_line_t trace_special_bin(struct trace_iterator *iter,
        return TRACE_TYPE_HANDLED;
 }
 
-static struct trace_event trace_special_event = {
-       .type           = TRACE_SPECIAL,
+static struct trace_event_functions trace_special_funcs = {
        .trace          = trace_special_print,
        .raw            = trace_special_print,
        .hex            = trace_special_hex,
        .binary         = trace_special_bin,
 };
 
+static struct trace_event trace_special_event = {
+       .type           = TRACE_SPECIAL,
+       .funcs          = &trace_special_funcs,
+};
+
 /* TRACE_STACK */
 
 static enum print_line_t trace_stack_print(struct trace_iterator *iter,
-                                          int flags)
+                                          int flags, struct trace_event *event)
 {
        struct stack_entry *field;
        struct trace_seq *s = &iter->seq;
@@ -1114,17 +1143,21 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
        return TRACE_TYPE_PARTIAL_LINE;
 }
 
-static struct trace_event trace_stack_event = {
-       .type           = TRACE_STACK,
+static struct trace_event_functions trace_stack_funcs = {
        .trace          = trace_stack_print,
        .raw            = trace_special_print,
        .hex            = trace_special_hex,
        .binary         = trace_special_bin,
 };
 
+static struct trace_event trace_stack_event = {
+       .type           = TRACE_STACK,
+       .funcs          = &trace_stack_funcs,
+};
+
 /* TRACE_USER_STACK */
 static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
-                                               int flags)
+                                               int flags, struct trace_event *event)
 {
        struct userstack_entry *field;
        struct trace_seq *s = &iter->seq;
@@ -1143,17 +1176,22 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
        return TRACE_TYPE_PARTIAL_LINE;
 }
 
-static struct trace_event trace_user_stack_event = {
-       .type           = TRACE_USER_STACK,
+static struct trace_event_functions trace_user_stack_funcs = {
        .trace          = trace_user_stack_print,
        .raw            = trace_special_print,
        .hex            = trace_special_hex,
        .binary         = trace_special_bin,
 };
 
+static struct trace_event trace_user_stack_event = {
+       .type           = TRACE_USER_STACK,
+       .funcs          = &trace_user_stack_funcs,
+};
+
 /* TRACE_BPRINT */
 static enum print_line_t
-trace_bprint_print(struct trace_iterator *iter, int flags)
+trace_bprint_print(struct trace_iterator *iter, int flags,
+                  struct trace_event *event)
 {
        struct trace_entry *entry = iter->ent;
        struct trace_seq *s = &iter->seq;
@@ -1178,7 +1216,8 @@ trace_bprint_print(struct trace_iterator *iter, int flags)
 
 
 static enum print_line_t
-trace_bprint_raw(struct trace_iterator *iter, int flags)
+trace_bprint_raw(struct trace_iterator *iter, int flags,
+                struct trace_event *event)
 {
        struct bprint_entry *field;
        struct trace_seq *s = &iter->seq;
@@ -1197,16 +1236,19 @@ trace_bprint_raw(struct trace_iterator *iter, int flags)
        return TRACE_TYPE_PARTIAL_LINE;
 }
 
+static struct trace_event_functions trace_bprint_funcs = {
+       .trace          = trace_bprint_print,
+       .raw            = trace_bprint_raw,
+};
 
 static struct trace_event trace_bprint_event = {
        .type           = TRACE_BPRINT,
-       .trace          = trace_bprint_print,
-       .raw            = trace_bprint_raw,
+       .funcs          = &trace_bprint_funcs,
 };
 
 /* TRACE_PRINT */
 static enum print_line_t trace_print_print(struct trace_iterator *iter,
-                                          int flags)
+                                          int flags, struct trace_event *event)
 {
        struct print_entry *field;
        struct trace_seq *s = &iter->seq;
@@ -1225,7 +1267,8 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
        return TRACE_TYPE_PARTIAL_LINE;
 }
 
-static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
+                                        struct trace_event *event)
 {
        struct print_entry *field;
 
@@ -1240,12 +1283,16 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
        return TRACE_TYPE_PARTIAL_LINE;
 }
 
-static struct trace_event trace_print_event = {
-       .type           = TRACE_PRINT,
+static struct trace_event_functions trace_print_funcs = {
        .trace          = trace_print_print,
        .raw            = trace_print_raw,
 };
 
+static struct trace_event trace_print_event = {
+       .type           = TRACE_PRINT,
+       .funcs          = &trace_print_funcs,
+};
+
 
 static struct trace_event *events[] __initdata = {
        &trace_fn_event,
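
The trace_output.c hunks above split the output callbacks out of struct trace_event into a shared struct trace_event_functions, and pass the owning struct trace_event * to every callback so a printer can recover its container with container_of() instead of looking the event up by type. Below is a reduced model of that layout; the structure and field names mirror the diff but are illustrative stand-ins, not the kernel types.

/* Reduced model of the trace_event / trace_event_functions split;
 * simplified stand-in types, not the kernel's. */
#include <stddef.h>
#include <stdio.h>

struct trace_event;

struct trace_event_functions {
        /* Callbacks receive the event so they can find its container. */
        int (*trace)(int flags, struct trace_event *event);
};

struct trace_event {
        int type;
        struct trace_event_functions *funcs;      /* shared output table */
};

struct my_probe {
        const char *name;
        struct trace_event event;                 /* embedded, as in trace_probe */
};

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

static int my_trace(int flags, struct trace_event *event)
{
        struct my_probe *p = container_of(event, struct my_probe, event);

        (void)flags;
        return printf("%s: type=%d\n", p->name, event->type);
}

static struct trace_event_functions my_funcs = {
        .trace = my_trace,            /* one table shared by many events */
};

int main(void)
{
        struct my_probe probe = {
                .name  = "sample",
                .event = { .type = 1, .funcs = &my_funcs },
        };

        return probe.event.funcs->trace(0, &probe.event) < 0;
}
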
index 9d91c72ba38b91c6aacc9011180eed5ef85fa2bd..c038eba0492ba182fd3276e177f8b0fe13d56a55 100644 (file)
@@ -25,7 +25,7 @@ extern void trace_event_read_unlock(void);
 extern struct trace_event *ftrace_find_event(int type);
 
 extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
-                                        int flags);
+                                        int flags, struct trace_event *event);
 extern int
 trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
 
index 5fca0f51fde4ac27df4c733c6ebd827c34d661b2..8f758d070c43777d96fb85515d3161c1eae1c63e 100644 (file)
@@ -50,8 +50,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
 }
 
 static void
-probe_sched_switch(struct rq *__rq, struct task_struct *prev,
-                       struct task_struct *next)
+probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next)
 {
        struct trace_array_cpu *data;
        unsigned long flags;
@@ -109,7 +108,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 }
 
 static void
-probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
+probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
 {
        struct trace_array_cpu *data;
        unsigned long flags;
@@ -139,21 +138,21 @@ static int tracing_sched_register(void)
 {
        int ret;
 
-       ret = register_trace_sched_wakeup(probe_sched_wakeup);
+       ret = register_trace_sched_wakeup(probe_sched_wakeup, NULL);
        if (ret) {
                pr_info("wakeup trace: Couldn't activate tracepoint"
                        " probe to kernel_sched_wakeup\n");
                return ret;
        }
 
-       ret = register_trace_sched_wakeup_new(probe_sched_wakeup);
+       ret = register_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
        if (ret) {
                pr_info("wakeup trace: Couldn't activate tracepoint"
                        " probe to kernel_sched_wakeup_new\n");
                goto fail_deprobe;
        }
 
-       ret = register_trace_sched_switch(probe_sched_switch);
+       ret = register_trace_sched_switch(probe_sched_switch, NULL);
        if (ret) {
                pr_info("sched trace: Couldn't activate tracepoint"
                        " probe to kernel_sched_switch\n");
@@ -162,17 +161,17 @@ static int tracing_sched_register(void)
 
        return ret;
 fail_deprobe_wake_new:
-       unregister_trace_sched_wakeup_new(probe_sched_wakeup);
+       unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
 fail_deprobe:
-       unregister_trace_sched_wakeup(probe_sched_wakeup);
+       unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
        return ret;
 }
 
 static void tracing_sched_unregister(void)
 {
-       unregister_trace_sched_switch(probe_sched_switch);
-       unregister_trace_sched_wakeup_new(probe_sched_wakeup);
-       unregister_trace_sched_wakeup(probe_sched_wakeup);
+       unregister_trace_sched_switch(probe_sched_switch, NULL);
+       unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
+       unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
 }
 
 static void tracing_start_sched_switch(void)
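
The sched-switch probes above lose their struct rq * argument and instead take a leading private-data pointer, which is supplied (here as NULL) at registration time through the two-argument register_trace_*() calls. A sketch of registering and firing a probe that carries per-registration data follows; the register/fire helpers are illustrative, not the real tracepoint API.

/* Sketch of the probe-with-private-data convention used above: the
 * first probe argument is whatever pointer was passed at register
 * time.  register_probe()/fire_probe() are stand-ins. */
#include <stdio.h>

typedef void (*switch_probe_t)(void *data, const char *prev, const char *next);

static switch_probe_t registered_probe;
static void *registered_data;

static int register_probe(switch_probe_t probe, void *data)
{
        registered_probe = probe;
        registered_data  = data;     /* handed back on every invocation */
        return 0;
}

static void fire_probe(const char *prev, const char *next)
{
        if (registered_probe)
                registered_probe(registered_data, prev, next);
}

static void probe_sched_switch(void *ignore, const char *prev, const char *next)
{
        (void)ignore;                /* this probe needs no private data */
        printf("switch %s -> %s\n", prev, next);
}

int main(void)
{
        register_probe(probe_sched_switch, NULL);   /* NULL data, as in the diff */
        fire_probe("swapper/0", "bash");
        return 0;
}
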
index 0271742abb8d1188e34c19905c89a4cc169843e0..0e73bc2ef8c55a0b8a47ffe19d7b1aad3577de2b 100644 (file)
@@ -98,7 +98,8 @@ static int report_latency(cycle_t delta)
        return 1;
 }
 
-static void probe_wakeup_migrate_task(struct task_struct *task, int cpu)
+static void
+probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu)
 {
        if (task != wakeup_task)
                return;
@@ -107,8 +108,8 @@ static void probe_wakeup_migrate_task(struct task_struct *task, int cpu)
 }
 
 static void notrace
-probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
-       struct task_struct *next)
+probe_wakeup_sched_switch(void *ignore,
+                         struct task_struct *prev, struct task_struct *next)
 {
        struct trace_array_cpu *data;
        cycle_t T0, T1, delta;
@@ -200,7 +201,7 @@ static void wakeup_reset(struct trace_array *tr)
 }
 
 static void
-probe_wakeup(struct rq *rq, struct task_struct *p, int success)
+probe_wakeup(void *ignore, struct task_struct *p, int success)
 {
        struct trace_array_cpu *data;
        int cpu = smp_processor_id();
@@ -264,28 +265,28 @@ static void start_wakeup_tracer(struct trace_array *tr)
 {
        int ret;
 
-       ret = register_trace_sched_wakeup(probe_wakeup);
+       ret = register_trace_sched_wakeup(probe_wakeup, NULL);
        if (ret) {
                pr_info("wakeup trace: Couldn't activate tracepoint"
                        " probe to kernel_sched_wakeup\n");
                return;
        }
 
-       ret = register_trace_sched_wakeup_new(probe_wakeup);
+       ret = register_trace_sched_wakeup_new(probe_wakeup, NULL);
        if (ret) {
                pr_info("wakeup trace: Couldn't activate tracepoint"
                        " probe to kernel_sched_wakeup_new\n");
                goto fail_deprobe;
        }
 
-       ret = register_trace_sched_switch(probe_wakeup_sched_switch);
+       ret = register_trace_sched_switch(probe_wakeup_sched_switch, NULL);
        if (ret) {
                pr_info("sched trace: Couldn't activate tracepoint"
                        " probe to kernel_sched_switch\n");
                goto fail_deprobe_wake_new;
        }
 
-       ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task);
+       ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);
        if (ret) {
                pr_info("wakeup trace: Couldn't activate tracepoint"
                        " probe to kernel_sched_migrate_task\n");
@@ -312,19 +313,19 @@ static void start_wakeup_tracer(struct trace_array *tr)
 
        return;
 fail_deprobe_wake_new:
-       unregister_trace_sched_wakeup_new(probe_wakeup);
+       unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
 fail_deprobe:
-       unregister_trace_sched_wakeup(probe_wakeup);
+       unregister_trace_sched_wakeup(probe_wakeup, NULL);
 }
 
 static void stop_wakeup_tracer(struct trace_array *tr)
 {
        tracer_enabled = 0;
        unregister_ftrace_function(&trace_ops);
-       unregister_trace_sched_switch(probe_wakeup_sched_switch);
-       unregister_trace_sched_wakeup_new(probe_wakeup);
-       unregister_trace_sched_wakeup(probe_wakeup);
-       unregister_trace_sched_migrate_task(probe_wakeup_migrate_task);
+       unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
+       unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
+       unregister_trace_sched_wakeup(probe_wakeup, NULL);
+       unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);
 }
 
 static int __wakeup_tracer_init(struct trace_array *tr)
index 1cc9858258b33468627f0c9c787d8401480a152e..250e7f9bd2f0114c998d26c239fb3bfb535ebde2 100644 (file)
@@ -29,7 +29,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
        struct trace_entry *entry;
        unsigned int loops = 0;
 
-       while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) {
+       while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) {
                entry = ring_buffer_event_data(event);
 
                /*
@@ -255,7 +255,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
 /* Maximum number of functions to trace before diagnosing a hang */
 #define GRAPH_MAX_FUNC_TEST    100000000
 
-static void __ftrace_dump(bool disable_tracing);
+static void
+__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode);
 static unsigned int graph_hang_thresh;
 
 /* Wrap the real function entry probe to avoid possible hanging */
@@ -266,7 +267,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
                ftrace_graph_stop();
                printk(KERN_WARNING "BUG: Function graph tracer hang!\n");
                if (ftrace_dump_on_oops)
-                       __ftrace_dump(false);
+                       __ftrace_dump(false, DUMP_ALL);
                return 0;
        }
 
index 4d6d711717f2958a9c502f0e53bbb8a89ee41ff7..9d358301ae3eeea4503fc794a9127da87c94f3ad 100644 (file)
@@ -15,6 +15,54 @@ static int sys_refcount_exit;
 static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
 static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
 
+static int syscall_enter_register(struct ftrace_event_call *event,
+                                enum trace_reg type);
+static int syscall_exit_register(struct ftrace_event_call *event,
+                                enum trace_reg type);
+
+static int syscall_enter_define_fields(struct ftrace_event_call *call);
+static int syscall_exit_define_fields(struct ftrace_event_call *call);
+
+static struct list_head *
+syscall_get_enter_fields(struct ftrace_event_call *call)
+{
+       struct syscall_metadata *entry = call->data;
+
+       return &entry->enter_fields;
+}
+
+static struct list_head *
+syscall_get_exit_fields(struct ftrace_event_call *call)
+{
+       struct syscall_metadata *entry = call->data;
+
+       return &entry->exit_fields;
+}
+
+struct trace_event_functions enter_syscall_print_funcs = {
+       .trace                  = print_syscall_enter,
+};
+
+struct trace_event_functions exit_syscall_print_funcs = {
+       .trace                  = print_syscall_exit,
+};
+
+struct ftrace_event_class event_class_syscall_enter = {
+       .system                 = "syscalls",
+       .reg                    = syscall_enter_register,
+       .define_fields          = syscall_enter_define_fields,
+       .get_fields             = syscall_get_enter_fields,
+       .raw_init               = init_syscall_trace,
+};
+
+struct ftrace_event_class event_class_syscall_exit = {
+       .system                 = "syscalls",
+       .reg                    = syscall_exit_register,
+       .define_fields          = syscall_exit_define_fields,
+       .get_fields             = syscall_get_exit_fields,
+       .raw_init               = init_syscall_trace,
+};
+
 extern unsigned long __start_syscalls_metadata[];
 extern unsigned long __stop_syscalls_metadata[];
 
@@ -53,7 +101,8 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr)
 }
 
 enum print_line_t
-print_syscall_enter(struct trace_iterator *iter, int flags)
+print_syscall_enter(struct trace_iterator *iter, int flags,
+                   struct trace_event *event)
 {
        struct trace_seq *s = &iter->seq;
        struct trace_entry *ent = iter->ent;
@@ -68,7 +117,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags)
        if (!entry)
                goto end;
 
-       if (entry->enter_event->id != ent->type) {
+       if (entry->enter_event->event.type != ent->type) {
                WARN_ON_ONCE(1);
                goto end;
        }
@@ -105,7 +154,8 @@ end:
 }
 
 enum print_line_t
-print_syscall_exit(struct trace_iterator *iter, int flags)
+print_syscall_exit(struct trace_iterator *iter, int flags,
+                  struct trace_event *event)
 {
        struct trace_seq *s = &iter->seq;
        struct trace_entry *ent = iter->ent;
@@ -123,7 +173,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
                return TRACE_TYPE_HANDLED;
        }
 
-       if (entry->exit_event->id != ent->type) {
+       if (entry->exit_event->event.type != ent->type) {
                WARN_ON_ONCE(1);
                return TRACE_TYPE_UNHANDLED;
        }
@@ -205,7 +255,7 @@ static void free_syscall_print_fmt(struct ftrace_event_call *call)
                kfree(call->print_fmt);
 }
 
-int syscall_enter_define_fields(struct ftrace_event_call *call)
+static int syscall_enter_define_fields(struct ftrace_event_call *call)
 {
        struct syscall_trace_enter trace;
        struct syscall_metadata *meta = call->data;
@@ -228,7 +278,7 @@ int syscall_enter_define_fields(struct ftrace_event_call *call)
        return ret;
 }
 
-int syscall_exit_define_fields(struct ftrace_event_call *call)
+static int syscall_exit_define_fields(struct ftrace_event_call *call)
 {
        struct syscall_trace_exit trace;
        int ret;
@@ -243,7 +293,7 @@ int syscall_exit_define_fields(struct ftrace_event_call *call)
        return ret;
 }
 
-void ftrace_syscall_enter(struct pt_regs *regs, long id)
+void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 {
        struct syscall_trace_enter *entry;
        struct syscall_metadata *sys_data;
@@ -265,7 +315,7 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
        size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
 
        event = trace_current_buffer_lock_reserve(&buffer,
-                       sys_data->enter_event->id, size, 0, 0);
+                       sys_data->enter_event->event.type, size, 0, 0);
        if (!event)
                return;
 
@@ -278,7 +328,7 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
                trace_current_buffer_unlock_commit(buffer, event, 0, 0);
 }
 
-void ftrace_syscall_exit(struct pt_regs *regs, long ret)
+void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 {
        struct syscall_trace_exit *entry;
        struct syscall_metadata *sys_data;
@@ -297,7 +347,7 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
                return;
 
        event = trace_current_buffer_lock_reserve(&buffer,
-                       sys_data->exit_event->id, sizeof(*entry), 0, 0);
+                       sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
        if (!event)
                return;
 
@@ -320,7 +370,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
                return -ENOSYS;
        mutex_lock(&syscall_trace_lock);
        if (!sys_refcount_enter)
-               ret = register_trace_sys_enter(ftrace_syscall_enter);
+               ret = register_trace_sys_enter(ftrace_syscall_enter, NULL);
        if (!ret) {
                set_bit(num, enabled_enter_syscalls);
                sys_refcount_enter++;
@@ -340,7 +390,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
        sys_refcount_enter--;
        clear_bit(num, enabled_enter_syscalls);
        if (!sys_refcount_enter)
-               unregister_trace_sys_enter(ftrace_syscall_enter);
+               unregister_trace_sys_enter(ftrace_syscall_enter, NULL);
        mutex_unlock(&syscall_trace_lock);
 }
 
@@ -354,7 +404,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
                return -ENOSYS;
        mutex_lock(&syscall_trace_lock);
        if (!sys_refcount_exit)
-               ret = register_trace_sys_exit(ftrace_syscall_exit);
+               ret = register_trace_sys_exit(ftrace_syscall_exit, NULL);
        if (!ret) {
                set_bit(num, enabled_exit_syscalls);
                sys_refcount_exit++;
@@ -374,7 +424,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
        sys_refcount_exit--;
        clear_bit(num, enabled_exit_syscalls);
        if (!sys_refcount_exit)
-               unregister_trace_sys_exit(ftrace_syscall_exit);
+               unregister_trace_sys_exit(ftrace_syscall_exit, NULL);
        mutex_unlock(&syscall_trace_lock);
 }
 
@@ -434,7 +484,7 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
 static int sys_perf_refcount_enter;
 static int sys_perf_refcount_exit;
 
-static void perf_syscall_enter(struct pt_regs *regs, long id)
+static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 {
        struct syscall_metadata *sys_data;
        struct syscall_trace_enter *rec;
@@ -461,7 +511,8 @@ static void perf_syscall_enter(struct pt_regs *regs, long id)
                return;
 
        rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
-                               sys_data->enter_event->id, &rctx, &flags);
+                               sys_data->enter_event->event.type,
+                               &rctx, &flags);
        if (!rec)
                return;
 
@@ -480,7 +531,7 @@ int perf_sysenter_enable(struct ftrace_event_call *call)
 
        mutex_lock(&syscall_trace_lock);
        if (!sys_perf_refcount_enter)
-               ret = register_trace_sys_enter(perf_syscall_enter);
+               ret = register_trace_sys_enter(perf_syscall_enter, NULL);
        if (ret) {
                pr_info("event trace: Could not activate"
                                "syscall entry trace point");
@@ -502,11 +553,11 @@ void perf_sysenter_disable(struct ftrace_event_call *call)
        sys_perf_refcount_enter--;
        clear_bit(num, enabled_perf_enter_syscalls);
        if (!sys_perf_refcount_enter)
-               unregister_trace_sys_enter(perf_syscall_enter);
+               unregister_trace_sys_enter(perf_syscall_enter, NULL);
        mutex_unlock(&syscall_trace_lock);
 }
 
-static void perf_syscall_exit(struct pt_regs *regs, long ret)
+static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 {
        struct syscall_metadata *sys_data;
        struct syscall_trace_exit *rec;
@@ -536,7 +587,8 @@ static void perf_syscall_exit(struct pt_regs *regs, long ret)
                return;
 
        rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
-                               sys_data->exit_event->id, &rctx, &flags);
+                               sys_data->exit_event->event.type,
+                               &rctx, &flags);
        if (!rec)
                return;
 
@@ -555,7 +607,7 @@ int perf_sysexit_enable(struct ftrace_event_call *call)
 
        mutex_lock(&syscall_trace_lock);
        if (!sys_perf_refcount_exit)
-               ret = register_trace_sys_exit(perf_syscall_exit);
+               ret = register_trace_sys_exit(perf_syscall_exit, NULL);
        if (ret) {
                pr_info("event trace: Could not activate"
                                "syscall exit trace point");
@@ -577,9 +629,50 @@ void perf_sysexit_disable(struct ftrace_event_call *call)
        sys_perf_refcount_exit--;
        clear_bit(num, enabled_perf_exit_syscalls);
        if (!sys_perf_refcount_exit)
-               unregister_trace_sys_exit(perf_syscall_exit);
+               unregister_trace_sys_exit(perf_syscall_exit, NULL);
        mutex_unlock(&syscall_trace_lock);
 }
 
 #endif /* CONFIG_PERF_EVENTS */
 
+static int syscall_enter_register(struct ftrace_event_call *event,
+                                enum trace_reg type)
+{
+       switch (type) {
+       case TRACE_REG_REGISTER:
+               return reg_event_syscall_enter(event);
+       case TRACE_REG_UNREGISTER:
+               unreg_event_syscall_enter(event);
+               return 0;
+
+#ifdef CONFIG_PERF_EVENTS
+       case TRACE_REG_PERF_REGISTER:
+               return perf_sysenter_enable(event);
+       case TRACE_REG_PERF_UNREGISTER:
+               perf_sysenter_disable(event);
+               return 0;
+#endif
+       }
+       return 0;
+}
+
+static int syscall_exit_register(struct ftrace_event_call *event,
+                                enum trace_reg type)
+{
+       switch (type) {
+       case TRACE_REG_REGISTER:
+               return reg_event_syscall_exit(event);
+       case TRACE_REG_UNREGISTER:
+               unreg_event_syscall_exit(event);
+               return 0;
+
+#ifdef CONFIG_PERF_EVENTS
+       case TRACE_REG_PERF_REGISTER:
+               return perf_sysexit_enable(event);
+       case TRACE_REG_PERF_UNREGISTER:
+               perf_sysexit_disable(event);
+               return 0;
+#endif
+       }
+       return 0;
+}
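
The syscall tracing rework above introduces shared "syscalls" event classes whose reg callback dispatches on enum trace_reg and whose get_fields hook returns the per-direction field list from the syscall metadata hung off call->data. A sketch of that class/callback layout, using simplified stand-in types rather than the kernel API, is below.

/* Simplified stand-ins for the event-class layout introduced above;
 * field names follow the diff, but none of this is the kernel API. */
struct list_head { struct list_head *next, *prev; };

struct event_call;

enum trace_reg { TRACE_REG_REGISTER, TRACE_REG_UNREGISTER };

struct event_class {
        const char *system;
        int (*reg)(struct event_call *call, enum trace_reg type);
        struct list_head *(*get_fields)(struct event_call *call);
};

struct syscall_metadata {
        struct list_head enter_fields;
        struct list_head exit_fields;
};

struct event_call {
        struct event_class *class;
        void *data;                      /* points at the syscall_metadata */
};

static struct list_head *get_enter_fields(struct event_call *call)
{
        struct syscall_metadata *meta = call->data;

        return &meta->enter_fields;      /* per-direction field list */
}

static int enter_register(struct event_call *call, enum trace_reg type)
{
        (void)call;
        (void)type;
        return 0;   /* real code dispatches on type, as syscall_enter_register() does */
}

static struct event_class class_syscall_enter = {
        .system     = "syscalls",
        .reg        = enter_register,
        .get_fields = get_enter_fields,
};

int main(void)
{
        static struct syscall_metadata meta;
        struct event_call call = { .class = &class_syscall_enter, .data = &meta };

        call.class->get_fields(&call);                  /* -> &meta.enter_fields */
        return call.class->reg(&call, TRACE_REG_REGISTER);
}
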
index cc2d2faa7d9e037734f1e4c11e87418c4a3a5400..a7cc3793baf6897d2535737a52322552040d2737 100644 (file)
@@ -49,7 +49,8 @@ static void cpu_workqueue_stat_free(struct kref *kref)
 
 /* Insertion of a work */
 static void
-probe_workqueue_insertion(struct task_struct *wq_thread,
+probe_workqueue_insertion(void *ignore,
+                         struct task_struct *wq_thread,
                          struct work_struct *work)
 {
        int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -70,7 +71,8 @@ found:
 
 /* Execution of a work */
 static void
-probe_workqueue_execution(struct task_struct *wq_thread,
+probe_workqueue_execution(void *ignore,
+                         struct task_struct *wq_thread,
                          struct work_struct *work)
 {
        int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -90,7 +92,8 @@ found:
 }
 
 /* Creation of a cpu workqueue thread */
-static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
+static void probe_workqueue_creation(void *ignore,
+                                    struct task_struct *wq_thread, int cpu)
 {
        struct cpu_workqueue_stats *cws;
        unsigned long flags;
@@ -114,7 +117,8 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
 }
 
 /* Destruction of a cpu workqueue thread */
-static void probe_workqueue_destruction(struct task_struct *wq_thread)
+static void
+probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread)
 {
        /* Workqueue only execute on one cpu */
        int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -259,19 +263,19 @@ int __init trace_workqueue_early_init(void)
 {
        int ret, cpu;
 
-       ret = register_trace_workqueue_insertion(probe_workqueue_insertion);
+       ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
        if (ret)
                goto out;
 
-       ret = register_trace_workqueue_execution(probe_workqueue_execution);
+       ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL);
        if (ret)
                goto no_insertion;
 
-       ret = register_trace_workqueue_creation(probe_workqueue_creation);
+       ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL);
        if (ret)
                goto no_execution;
 
-       ret = register_trace_workqueue_destruction(probe_workqueue_destruction);
+       ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL);
        if (ret)
                goto no_creation;
 
@@ -283,11 +287,11 @@ int __init trace_workqueue_early_init(void)
        return 0;
 
 no_creation:
-       unregister_trace_workqueue_creation(probe_workqueue_creation);
+       unregister_trace_workqueue_creation(probe_workqueue_creation, NULL);
 no_execution:
-       unregister_trace_workqueue_execution(probe_workqueue_execution);
+       unregister_trace_workqueue_execution(probe_workqueue_execution, NULL);
 no_insertion:
-       unregister_trace_workqueue_insertion(probe_workqueue_insertion);
+       unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
 out:
        pr_warning("trace_workqueue: unable to trace workqueues\n");
 
index cc89be5bc0f8ef3d8776ed0da0cf021cf6a35003..c77f3eceea250e49b0ff001959f8df9eeb8e0716 100644 (file)
@@ -54,7 +54,7 @@ static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
  */
 struct tracepoint_entry {
        struct hlist_node hlist;
-       void **funcs;
+       struct tracepoint_func *funcs;
        int refcount;   /* Number of times armed. 0 if disarmed. */
        char name[0];
 };
@@ -64,12 +64,12 @@ struct tp_probes {
                struct rcu_head rcu;
                struct list_head list;
        } u;
-       void *probes[0];
+       struct tracepoint_func probes[0];
 };
 
 static inline void *allocate_probes(int count)
 {
-       struct tp_probes *p  = kmalloc(count * sizeof(void *)
+       struct tp_probes *p  = kmalloc(count * sizeof(struct tracepoint_func)
                        + sizeof(struct tp_probes), GFP_KERNEL);
        return p == NULL ? NULL : p->probes;
 }
@@ -79,7 +79,7 @@ static void rcu_free_old_probes(struct rcu_head *head)
        kfree(container_of(head, struct tp_probes, u.rcu));
 }
 
-static inline void release_probes(void *old)
+static inline void release_probes(struct tracepoint_func *old)
 {
        if (old) {
                struct tp_probes *tp_probes = container_of(old,
@@ -95,15 +95,16 @@ static void debug_print_probes(struct tracepoint_entry *entry)
        if (!tracepoint_debug || !entry->funcs)
                return;
 
-       for (i = 0; entry->funcs[i]; i++)
-               printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i]);
+       for (i = 0; entry->funcs[i].func; i++)
+               printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i].func);
 }
 
-static void *
-tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
+static struct tracepoint_func *
+tracepoint_entry_add_probe(struct tracepoint_entry *entry,
+                          void *probe, void *data)
 {
        int nr_probes = 0;
-       void **old, **new;
+       struct tracepoint_func *old, *new;
 
        WARN_ON(!probe);
 
@@ -111,8 +112,9 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
        old = entry->funcs;
        if (old) {
                /* (N -> N+1), (N != 0, 1) probes */
-               for (nr_probes = 0; old[nr_probes]; nr_probes++)
-                       if (old[nr_probes] == probe)
+               for (nr_probes = 0; old[nr_probes].func; nr_probes++)
+                       if (old[nr_probes].func == probe &&
+                           old[nr_probes].data == data)
                                return ERR_PTR(-EEXIST);
        }
        /* + 2 : one for new probe, one for NULL func */
@@ -120,9 +122,10 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
        if (new == NULL)
                return ERR_PTR(-ENOMEM);
        if (old)
-               memcpy(new, old, nr_probes * sizeof(void *));
-       new[nr_probes] = probe;
-       new[nr_probes + 1] = NULL;
+               memcpy(new, old, nr_probes * sizeof(struct tracepoint_func));
+       new[nr_probes].func = probe;
+       new[nr_probes].data = data;
+       new[nr_probes + 1].func = NULL;
        entry->refcount = nr_probes + 1;
        entry->funcs = new;
        debug_print_probes(entry);
@@ -130,10 +133,11 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
 }
 
 static void *
-tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
+tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
+                             void *probe, void *data)
 {
        int nr_probes = 0, nr_del = 0, i;
-       void **old, **new;
+       struct tracepoint_func *old, *new;
 
        old = entry->funcs;
 
@@ -142,8 +146,10 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
 
        debug_print_probes(entry);
        /* (N -> M), (N > 1, M >= 0) probes */
-       for (nr_probes = 0; old[nr_probes]; nr_probes++) {
-               if ((!probe || old[nr_probes] == probe))
+       for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
+               if (!probe ||
+                   (old[nr_probes].func == probe &&
+                    old[nr_probes].data == data))
                        nr_del++;
        }
 
@@ -160,10 +166,11 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
                new = allocate_probes(nr_probes - nr_del + 1);
                if (new == NULL)
                        return ERR_PTR(-ENOMEM);
-               for (i = 0; old[i]; i++)
-                       if ((probe && old[i] != probe))
+               for (i = 0; old[i].func; i++)
+                       if (probe &&
+                           (old[i].func != probe || old[i].data != data))
                                new[j++] = old[i];
-               new[nr_probes - nr_del] = NULL;
+               new[nr_probes - nr_del].func = NULL;
                entry->refcount = nr_probes - nr_del;
                entry->funcs = new;
        }
@@ -315,18 +322,19 @@ static void tracepoint_update_probes(void)
        module_update_tracepoints();
 }
 
-static void *tracepoint_add_probe(const char *name, void *probe)
+static struct tracepoint_func *
+tracepoint_add_probe(const char *name, void *probe, void *data)
 {
        struct tracepoint_entry *entry;
-       void *old;
+       struct tracepoint_func *old;
 
        entry = get_tracepoint(name);
        if (!entry) {
                entry = add_tracepoint(name);
                if (IS_ERR(entry))
-                       return entry;
+                       return (struct tracepoint_func *)entry;
        }
-       old = tracepoint_entry_add_probe(entry, probe);
+       old = tracepoint_entry_add_probe(entry, probe, data);
        if (IS_ERR(old) && !entry->refcount)
                remove_tracepoint(entry);
        return old;
@@ -340,12 +348,12 @@ static void *tracepoint_add_probe(const char *name, void *probe)
  * Returns 0 if ok, error value on error.
  * The probe address must at least be aligned on the architecture pointer size.
  */
-int tracepoint_probe_register(const char *name, void *probe)
+int tracepoint_probe_register(const char *name, void *probe, void *data)
 {
-       void *old;
+       struct tracepoint_func *old;
 
        mutex_lock(&tracepoints_mutex);
-       old = tracepoint_add_probe(name, probe);
+       old = tracepoint_add_probe(name, probe, data);
        mutex_unlock(&tracepoints_mutex);
        if (IS_ERR(old))
                return PTR_ERR(old);
@@ -356,15 +364,16 @@ int tracepoint_probe_register(const char *name, void *probe)
 }
 EXPORT_SYMBOL_GPL(tracepoint_probe_register);
 
-static void *tracepoint_remove_probe(const char *name, void *probe)
+static struct tracepoint_func *
+tracepoint_remove_probe(const char *name, void *probe, void *data)
 {
        struct tracepoint_entry *entry;
-       void *old;
+       struct tracepoint_func *old;
 
        entry = get_tracepoint(name);
        if (!entry)
                return ERR_PTR(-ENOENT);
-       old = tracepoint_entry_remove_probe(entry, probe);
+       old = tracepoint_entry_remove_probe(entry, probe, data);
        if (IS_ERR(old))
                return old;
        if (!entry->refcount)
@@ -382,12 +391,12 @@ static void *tracepoint_remove_probe(const char *name, void *probe)
  * itself uses stop_machine(), which insures that every preempt disabled section
  * have finished.
  */
-int tracepoint_probe_unregister(const char *name, void *probe)
+int tracepoint_probe_unregister(const char *name, void *probe, void *data)
 {
-       void *old;
+       struct tracepoint_func *old;
 
        mutex_lock(&tracepoints_mutex);
-       old = tracepoint_remove_probe(name, probe);
+       old = tracepoint_remove_probe(name, probe, data);
        mutex_unlock(&tracepoints_mutex);
        if (IS_ERR(old))
                return PTR_ERR(old);
@@ -418,12 +427,13 @@ static void tracepoint_add_old_probes(void *old)
  *
  * caller must call tracepoint_probe_update_all()
  */
-int tracepoint_probe_register_noupdate(const char *name, void *probe)
+int tracepoint_probe_register_noupdate(const char *name, void *probe,
+                                      void *data)
 {
-       void *old;
+       struct tracepoint_func *old;
 
        mutex_lock(&tracepoints_mutex);
-       old = tracepoint_add_probe(name, probe);
+       old = tracepoint_add_probe(name, probe, data);
        if (IS_ERR(old)) {
                mutex_unlock(&tracepoints_mutex);
                return PTR_ERR(old);
@@ -441,12 +451,13 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate);
  *
  * caller must call tracepoint_probe_update_all()
  */
-int tracepoint_probe_unregister_noupdate(const char *name, void *probe)
+int tracepoint_probe_unregister_noupdate(const char *name, void *probe,
+                                        void *data)
 {
-       void *old;
+       struct tracepoint_func *old;
 
        mutex_lock(&tracepoints_mutex);
-       old = tracepoint_remove_probe(name, probe);
+       old = tracepoint_remove_probe(name, probe, data);
        if (IS_ERR(old)) {
                mutex_unlock(&tracepoints_mutex);
                return PTR_ERR(old);
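
tracepoint.c above switches its probe arrays from bare void * entries to struct tracepoint_func pairs, so each slot records both the function and the private data registered with it; the array stays NULL-terminated, but on the .func member, and removal must match both fields. A small self-contained sketch of building and searching such an array follows, using an illustrative typed struct rather than the kernel one.

/* Sketch of the (func, data) pair array handled by
 * tracepoint_entry_add_probe()/remove_probe() above: a NULL .func ends
 * the array, and a probe is identified by function AND data pointer. */
#include <stdio.h>

typedef void (*probe_fn_t)(void *data);

struct tracepoint_func {
        probe_fn_t func;
        void *data;
};

static int find_probe(const struct tracepoint_func *funcs,
                      probe_fn_t probe, void *data)
{
        int i;

        for (i = 0; funcs[i].func; i++)          /* NULL .func terminates */
                if (funcs[i].func == probe && funcs[i].data == data)
                        return i;
        return -1;
}

static void demo_probe(void *data) { (void)data; }

int main(void)
{
        int cookie = 42;
        struct tracepoint_func funcs[] = {
                { .func = demo_probe, .data = &cookie },
                { .func = NULL },                /* terminator */
        };

        printf("matching data   : slot %d\n", find_probe(funcs, demo_probe, &cookie));
        printf("mismatched data : slot %d\n", find_probe(funcs, demo_probe, NULL));
        return 0;
}
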
index 766467b3bcb7f1e42da792bece602ec8404135a2..7e72614b736d82dd108d99ab79daefa55e6cdfe3 100644 (file)
@@ -16,7 +16,6 @@
 #include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/user_namespace.h>
-#include "cred-internals.h"
 
 struct user_namespace init_user_ns = {
        .kref = {
@@ -137,9 +136,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
        struct hlist_head *hashent = uidhashentry(ns, uid);
        struct user_struct *up, *new;
 
-       /* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
-        * atomic.
-        */
        spin_lock_irq(&uidhash_lock);
        up = uid_hash_find(uid, hashent);
        spin_unlock_irq(&uidhash_lock);
@@ -161,11 +157,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
                spin_lock_irq(&uidhash_lock);
                up = uid_hash_find(uid, hashent);
                if (up) {
-                       /* This case is not possible when CONFIG_USER_SCHED
-                        * is defined, since we serialize alloc_uid() using
-                        * uids_mutex. Hence no need to call
-                        * sched_destroy_user() or remove_user_sysfs_dir().
-                        */
                        key_put(new->uid_keyring);
                        key_put(new->session_keyring);
                        kmem_cache_free(uid_cachep, new);
@@ -178,8 +169,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
 
        return up;
 
-       put_user_ns(new->user_ns);
-       kmem_cache_free(uid_cachep, new);
 out_unlock:
        return NULL;
 }
index cf208d8042b198d962a8ccfbd81c0b70acca28c2..ad41529fb60f766ad095de5c40b9d537473314f1 100644 (file)
@@ -172,12 +172,12 @@ out:
        return;
 }
 
-static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
+static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location)
 {
        trace_drop_common(skb, location);
 }
 
-static void trace_napi_poll_hit(struct napi_struct *napi)
+static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi)
 {
        struct dm_hw_stat_delta *new_stat;
 
@@ -225,12 +225,12 @@ static int set_all_monitor_traces(int state)
 
        switch (state) {
        case TRACE_ON:
-               rc |= register_trace_kfree_skb(trace_kfree_skb_hit);
-               rc |= register_trace_napi_poll(trace_napi_poll_hit);
+               rc |= register_trace_kfree_skb(trace_kfree_skb_hit, NULL);
+               rc |= register_trace_napi_poll(trace_napi_poll_hit, NULL);
                break;
        case TRACE_OFF:
-               rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit);
-               rc |= unregister_trace_napi_poll(trace_napi_poll_hit);
+               rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit, NULL);
+               rc |= unregister_trace_napi_poll(trace_napi_poll_hit, NULL);
 
                tracepoint_synchronize_unregister();
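drop_monitor keeps no per-registration state, so it passes NULL here; a hedged sketch of how a caller that does want such state could use the new data argument (struct drop_stats and count_kfree_skb are hypothetical):

#include <linux/skbuff.h>
#include <linux/tracepoint.h>
#include <trace/events/skb.h>

struct drop_stats {			/* hypothetical per-registration state */
	atomic_t drops;
};

static struct drop_stats my_stats;

static void count_kfree_skb(void *data, struct sk_buff *skb, void *location)
{
	struct drop_stats *stats = data;

	atomic_inc(&stats->drops);
}

static int start_counting(void)
{
	/* The pointer given here comes back as the probe's first argument. */
	return register_trace_kfree_skb(count_kfree_skb, &my_stats);
}

static void stop_counting(void)
{
	unregister_trace_kfree_skb(count_kfree_skb, &my_stats);
	tracepoint_synchronize_unregister();
}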
 
index dffdc49878af6532104307430155f4aeb44bf538..4d46be965961fd4f2f6d745243f38b87a43dfd51 100644 (file)
@@ -7,7 +7,5 @@
 DECLARE_TRACE(subsys_event,
        TP_PROTO(struct inode *inode, struct file *file),
        TP_ARGS(inode, file));
-DECLARE_TRACE(subsys_eventb,
-       TP_PROTO(void),
-       TP_ARGS());
+DECLARE_TRACE_NOARGS(subsys_eventb);
 #endif
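DECLARE_TRACE_NOARGS is the new declaration for tracepoints that take no arguments, now that every probe carries a hidden data parameter. A hedged sketch of the other half, in the style of the existing tracepoint sample module (fire_eventb and probe_eventb are illustrative names):

#include <linux/module.h>
#include "tp-samples-trace.h"

DEFINE_TRACE(subsys_eventb);

static void fire_eventb(void)
{
	/* The call site still takes no arguments ... */
	trace_subsys_eventb();
}

/* ... while a registered probe receives only the data pointer. */
static void probe_eventb(void *data)
{
	printk(KERN_INFO "eventb hit, data=%p\n", data);
}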
index 9e60eb6ca2d8a691f4c01adc3592be40eb7e5824..744c0b9652a7815407b95ecbb8ca6553c5e900c0 100644 (file)
@@ -13,7 +13,8 @@
  * Here the caller only guarantees locking for struct file and struct inode.
  * Locking must therefore be done in the probe to use the dentry.
  */
-static void probe_subsys_event(struct inode *inode, struct file *file)
+static void probe_subsys_event(void *ignore,
+                              struct inode *inode, struct file *file)
 {
        path_get(&file->f_path);
        dget(file->f_path.dentry);
@@ -23,7 +24,7 @@ static void probe_subsys_event(struct inode *inode, struct file *file)
        path_put(&file->f_path);
 }
 
-static void probe_subsys_eventb(void)
+static void probe_subsys_eventb(void *ignore)
 {
        printk(KERN_INFO "Event B is encountered\n");
 }
@@ -32,9 +33,9 @@ static int __init tp_sample_trace_init(void)
 {
        int ret;
 
-       ret = register_trace_subsys_event(probe_subsys_event);
+       ret = register_trace_subsys_event(probe_subsys_event, NULL);
        WARN_ON(ret);
-       ret = register_trace_subsys_eventb(probe_subsys_eventb);
+       ret = register_trace_subsys_eventb(probe_subsys_eventb, NULL);
        WARN_ON(ret);
 
        return 0;
@@ -44,8 +45,8 @@ module_init(tp_sample_trace_init);
 
 static void __exit tp_sample_trace_exit(void)
 {
-       unregister_trace_subsys_eventb(probe_subsys_eventb);
-       unregister_trace_subsys_event(probe_subsys_event);
+       unregister_trace_subsys_eventb(probe_subsys_eventb, NULL);
+       unregister_trace_subsys_event(probe_subsys_event, NULL);
        tracepoint_synchronize_unregister();
 }
 
index be2a960573f174991fe26c4ccdfd5794bc61c988..9fcf990e5d4bd01c7095471105b2260028be1919 100644 (file)
@@ -12,7 +12,8 @@
  * Here the caller only guarantees locking for struct file and struct inode.
  * Locking must therefore be done in the probe to use the dentry.
  */
-static void probe_subsys_event(struct inode *inode, struct file *file)
+static void probe_subsys_event(void *ignore,
+                              struct inode *inode, struct file *file)
 {
        printk(KERN_INFO "Event is encountered with inode number %lu\n",
                inode->i_ino);
@@ -22,7 +23,7 @@ static int __init tp_sample_trace_init(void)
 {
        int ret;
 
-       ret = register_trace_subsys_event(probe_subsys_event);
+       ret = register_trace_subsys_event(probe_subsys_event, NULL);
        WARN_ON(ret);
 
        return 0;
@@ -32,7 +33,7 @@ module_init(tp_sample_trace_init);
 
 static void __exit tp_sample_trace_exit(void)
 {
-       unregister_trace_subsys_event(probe_subsys_event);
+       unregister_trace_subsys_event(probe_subsys_event, NULL);
        tracepoint_synchronize_unregister();
 }
 