2 * kerneltop.c: show top kernel functions - performance counters showcase
6 cc -O6 -Wall -lrt `pkg-config --cflags --libs glib-2.0` -o kerneltop kerneltop.c
10 ------------------------------------------------------------------------------
11 KernelTop: 2669 irqs/sec [NMI, cache-misses/cache-refs], (all, cpu: 2)
12 ------------------------------------------------------------------------------
14 weight RIP kernel function
15 ______ ________________ _______________
17 35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
18 33.00 - ffffffff804cb740 : sock_alloc_send_skb
19 31.26 - ffffffff804ce808 : skb_push
20 22.43 - ffffffff80510004 : tcp_established_options
21 19.00 - ffffffff8027d250 : find_get_page
22 15.76 - ffffffff804e4fc9 : eth_type_trans
23 15.20 - ffffffff804d8baa : dst_release
24 14.86 - ffffffff804cf5d8 : skb_release_head_state
25 14.00 - ffffffff802217d5 : read_hpet
26 12.00 - ffffffff804ffb7f : __ip_local_out
27 11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
28 8.54 - ffffffff805001a3 : ip_queue_xmit
32 * perfstat: /usr/bin/time -alike performance counter statistics utility
34 It summarizes the counter events of all tasks (and child tasks),
35 covering all CPUs that the command (or workload) executes on.
36 It only counts the per-task events of the workload started,
37 independent of how many other tasks run on those CPUs.
41 $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
43 Performance counter stats for 'ls':
45 163516953 instructions
51 * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
53 * Improvements and fixes by:
55 * Arjan van de Ven <arjan@linux.intel.com>
56 * Yanmin Zhang <yanmin.zhang@intel.com>
57 * Wu Fengguang <fengguang.wu@intel.com>
58 * Mike Galbraith <efault@gmx.de>
60 * Released under the GPL v2. (and only v2, not any later version)
64 #include <sys/types.h>
81 #include <sys/syscall.h>
82 #include <sys/ioctl.h>
84 #include <sys/prctl.h>
89 #include <linux/unistd.h>
91 #include "../../include/linux/perf_counter.h"
95 * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
96 * counters in the current task.
98 #define PR_TASK_PERF_COUNTERS_DISABLE 31
99 #define PR_TASK_PERF_COUNTERS_ENABLE 32
101 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
105 struct timespec ts; \
107 clock_gettime(CLOCK_MONOTONIC, &ts); \
108 ts.tv_sec * 1000000000ULL + ts.tv_nsec; \
112 * Pick up some kernel type conventions:
/* Kernel-style fixed-width integer aliases used by the perf_counter ABI. */
typedef unsigned int		__u32;
typedef unsigned long long	__u64;
typedef long long		__s64;
123 #define __NR_perf_counter_open 295
124 #define rmb() asm volatile("lfence" ::: "memory")
125 #define cpu_relax() asm volatile("rep; nop" ::: "memory");
129 #define __NR_perf_counter_open 333
130 #define rmb() asm volatile("lfence" ::: "memory")
131 #define cpu_relax() asm volatile("rep; nop" ::: "memory");
135 #define __NR_perf_counter_open 319
136 #define rmb() asm volatile ("sync" ::: "memory")
137 #define cpu_relax() asm volatile ("" ::: "memory");
140 #define unlikely(x) __builtin_expect(!!(x), 0)
142 asmlinkage
int sys_perf_counter_open(
143 struct perf_counter_hw_event
*hw_event_uptr __user
,
152 __NR_perf_counter_open
, hw_event_uptr
, pid
, cpu
, group_fd
, flags
);
153 #if defined(__x86_64__) || defined(__i386__)
154 if (ret
< 0 && ret
> -4096) {
162 #define MAX_COUNTERS 64
163 #define MAX_NR_CPUS 256
165 #define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
167 static int run_perfstat
= 0;
168 static int system_wide
= 0;
170 static int nr_counters
= 0;
171 static __u64 event_id
[MAX_COUNTERS
] = {
172 EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_TASK_CLOCK
),
173 EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_CPU_MIGRATIONS
),
174 EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_CPU_MIGRATIONS
),
175 EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_PAGE_FAULTS
),
177 EID(PERF_TYPE_HARDWARE
, PERF_COUNT_CPU_CYCLES
),
178 EID(PERF_TYPE_HARDWARE
, PERF_COUNT_INSTRUCTIONS
),
179 EID(PERF_TYPE_HARDWARE
, PERF_COUNT_CACHE_REFERENCES
),
180 EID(PERF_TYPE_HARDWARE
, PERF_COUNT_CACHE_MISSES
),
182 static int default_interval
= 100000;
183 static int event_count
[MAX_COUNTERS
];
184 static int fd
[MAX_NR_CPUS
][MAX_COUNTERS
];
186 static __u64 count_filter
= 100;
189 static int profile_cpu
= -1;
190 static int nr_cpus
= 0;
192 static int group
= 0;
193 static unsigned int page_size
;
195 static char *vmlinux
;
197 static char *sym_filter
;
198 static unsigned long filter_start
;
199 static unsigned long filter_end
;
201 static int delay_secs
= 2;
203 static int dump_symtab
;
214 const unsigned int default_count
[] = {
223 static char *hw_event_names
[] = {
233 static char *sw_event_names
[] = {
243 struct event_symbol
{
248 static struct event_symbol event_symbols
[] = {
249 {EID(PERF_TYPE_HARDWARE
, PERF_COUNT_CPU_CYCLES
), "cpu-cycles", },
250 {EID(PERF_TYPE_HARDWARE
, PERF_COUNT_CPU_CYCLES
), "cycles", },
251 {EID(PERF_TYPE_HARDWARE
, PERF_COUNT_INSTRUCTIONS
), "instructions", },
252 {EID(PERF_TYPE_HARDWARE
, PERF_COUNT_CACHE_REFERENCES
), "cache-references", },
253 {EID(PERF_TYPE_HARDWARE
, PERF_COUNT_CACHE_MISSES
), "cache-misses", },
254 {EID(PERF_TYPE_HARDWARE
, PERF_COUNT_BRANCH_INSTRUCTIONS
), "branch-instructions", },
255 {EID(PERF_TYPE_HARDWARE
, PERF_COUNT_BRANCH_INSTRUCTIONS
), "branches", },
256 {EID(PERF_TYPE_HARDWARE
, PERF_COUNT_BRANCH_MISSES
), "branch-misses", },
257 {EID(PERF_TYPE_HARDWARE
, PERF_COUNT_BUS_CYCLES
), "bus-cycles", },
259 {EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_CPU_CLOCK
), "cpu-clock", },
260 {EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_TASK_CLOCK
), "task-clock", },
261 {EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_PAGE_FAULTS
), "page-faults", },
262 {EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_PAGE_FAULTS
), "faults", },
263 {EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_PAGE_FAULTS_MIN
), "minor-faults", },
264 {EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_PAGE_FAULTS_MAJ
), "major-faults", },
265 {EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_CONTEXT_SWITCHES
), "context-switches", },
266 {EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_CONTEXT_SWITCHES
), "cs", },
267 {EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_CPU_MIGRATIONS
), "cpu-migrations", },
268 {EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_CPU_MIGRATIONS
), "migrations", },
271 #define __PERF_COUNTER_FIELD(config, name) \
272 ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
274 #define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW)
275 #define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG)
276 #define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE)
277 #define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT)
279 static void display_events_help(void)
285 " -e EVENT --event=EVENT # symbolic-name abbreviations");
287 for (i
= 0; i
< ARRAY_SIZE(event_symbols
); i
++) {
290 e
= event_symbols
[i
].event
;
291 type
= PERF_COUNTER_TYPE(e
);
292 id
= PERF_COUNTER_ID(e
);
294 printf("\n %d:%d: %-20s",
295 type
, id
, event_symbols
[i
].symbol
);
299 " rNNN: raw PMU events (eventsel+umask)\n\n");
302 static void display_perfstat_help(void)
305 "Usage: perfstat [<events...>] <cmd...>\n\n"
306 "PerfStat Options (up to %d event types can be specified):\n\n",
309 display_events_help();
312 " -a # system-wide collection\n");
316 static void display_help(void)
319 return display_perfstat_help();
322 "Usage: kerneltop [<options>]\n"
323 " Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n"
324 "KernelTop Options (up to %d event types can be specified at once):\n\n",
327 display_events_help();
330 " -S --stat # perfstat COMMAND\n"
331 " -a # system-wide collection (for perfstat)\n\n"
332 " -c CNT --count=CNT # event period to sample\n\n"
333 " -C CPU --cpu=CPU # CPU (-1 for all) [default: -1]\n"
334 " -p PID --pid=PID # PID of sampled task (-1 for all) [default: -1]\n\n"
335 " -d delay --delay=<seconds> # sampling/display delay [default: 2]\n"
336 " -f CNT --filter=CNT # min-event-count filter [default: 100]\n\n"
337 " -s symbol --symbol=<symbol> # function to be showed annotated one-shot\n"
338 " -x path --vmlinux=<path> # the vmlinux binary, required for -s use\n"
339 " -z --zero # zero counts after display\n"
340 " -D --dump_symtab # dump symbol table to stderr on startup\n"
346 static char *event_name(int ctr
)
348 __u64 config
= event_id
[ctr
];
349 int type
= PERF_COUNTER_TYPE(config
);
350 int id
= PERF_COUNTER_ID(config
);
353 if (PERF_COUNTER_RAW(config
)) {
354 sprintf(buf
, "raw 0x%llx", PERF_COUNTER_CONFIG(config
));
359 case PERF_TYPE_HARDWARE
:
360 if (id
< PERF_HW_EVENTS_MAX
)
361 return hw_event_names
[id
];
362 return "unknown-hardware";
364 case PERF_TYPE_SOFTWARE
:
365 if (id
< PERF_SW_EVENTS_MAX
)
366 return sw_event_names
[id
];
367 return "unknown-software";
377 * Each event can have multiple symbolic names.
378 * Symbolic names are (almost) exactly matched.
380 static __u64
match_event_symbols(char *str
)
386 if (sscanf(str
, "r%llx", &config
) == 1)
387 return config
| PERF_COUNTER_RAW_MASK
;
389 if (sscanf(str
, "%d:%llu", &type
, &id
) == 2)
390 return EID(type
, id
);
392 for (i
= 0; i
< ARRAY_SIZE(event_symbols
); i
++) {
393 if (!strncmp(str
, event_symbols
[i
].symbol
,
394 strlen(event_symbols
[i
].symbol
)))
395 return event_symbols
[i
].event
;
401 static int parse_events(char *str
)
406 if (nr_counters
== MAX_COUNTERS
)
409 config
= match_event_symbols(str
);
413 event_id
[nr_counters
] = config
;
416 str
= strstr(str
, ",");
430 char fault_here
[1000000];
432 static void create_perfstat_counter(int counter
)
434 struct perf_counter_hw_event hw_event
;
436 memset(&hw_event
, 0, sizeof(hw_event
));
437 hw_event
.config
= event_id
[counter
];
438 hw_event
.record_type
= PERF_RECORD_SIMPLE
;
443 for (cpu
= 0; cpu
< nr_cpus
; cpu
++) {
444 fd
[cpu
][counter
] = sys_perf_counter_open(&hw_event
, -1, cpu
, -1, 0);
445 if (fd
[cpu
][counter
] < 0) {
446 printf("perfstat error: syscall returned with %d (%s)\n",
447 fd
[cpu
][counter
], strerror(errno
));
452 hw_event
.inherit
= 1;
453 hw_event
.disabled
= 1;
455 fd
[0][counter
] = sys_perf_counter_open(&hw_event
, 0, -1, -1, 0);
456 if (fd
[0][counter
] < 0) {
457 printf("perfstat error: syscall returned with %d (%s)\n",
458 fd
[0][counter
], strerror(errno
));
464 int do_perfstat(int argc
, char *argv
[])
466 unsigned long long t0
, t1
;
475 for (counter
= 0; counter
< nr_counters
; counter
++)
476 create_perfstat_counter(counter
);
485 * Enable counters and exec the command:
488 prctl(PR_TASK_PERF_COUNTERS_ENABLE
);
490 if ((pid
= fork()) < 0)
491 perror("failed to fork");
493 if (execvp(argv
[0], argv
)) {
498 while (wait(&status
) >= 0)
500 prctl(PR_TASK_PERF_COUNTERS_DISABLE
);
505 fprintf(stderr
, "\n");
506 fprintf(stderr
, " Performance counter stats for \'%s\':\n",
508 fprintf(stderr
, "\n");
510 for (counter
= 0; counter
< nr_counters
; counter
++) {
512 __u64 count
, single_count
;
515 for (cpu
= 0; cpu
< nr_cpus
; cpu
++) {
516 res
= read(fd
[cpu
][counter
],
517 (char *) &single_count
, sizeof(single_count
));
518 assert(res
== sizeof(single_count
));
519 count
+= single_count
;
522 if (!PERF_COUNTER_RAW(event_id
[counter
]) &&
523 (event_id
[counter
] == PERF_COUNT_CPU_CLOCK
||
524 event_id
[counter
] == PERF_COUNT_TASK_CLOCK
)) {
526 double msecs
= (double)count
/ 1000000;
528 fprintf(stderr
, " %14.6f %-20s (msecs)\n",
529 msecs
, event_name(counter
));
531 fprintf(stderr
, " %14Ld %-20s (events)\n",
532 count
, event_name(counter
));
535 fprintf(stderr
, "\n");
537 fprintf(stderr
, "\n");
538 fprintf(stderr
, " Wall-clock time elapsed: %12.6f msecs\n",
539 (double)(t1
-t0
)/1e6
);
540 fprintf(stderr
, "\n");
549 static uint64_t min_ip
;
550 static uint64_t max_ip
= -1ll;
553 unsigned long long addr
;
555 unsigned long count
[MAX_COUNTERS
];
560 #define MAX_SYMS 100000
562 static int sym_table_count
;
564 struct sym_entry
*sym_filter_entry
;
566 static struct sym_entry sym_table
[MAX_SYMS
];
568 static void show_details(struct sym_entry
*sym
);
571 * Ordering weight: count-1 * count-2 * ... / count-n
573 static double sym_weight(const struct sym_entry
*sym
)
578 weight
= sym
->count
[0];
580 for (counter
= 1; counter
< nr_counters
-1; counter
++)
581 weight
*= sym
->count
[counter
];
583 weight
/= (sym
->count
[counter
] + 1);
/*
 * qsort() comparator: sort symbols by descending weight.
 *
 * Fix: the original returned the bare boolean 'w1 < w2', which maps
 * both "greater" and "equal" to 0.  That is not a consistent total
 * order, and the C standard makes qsort()'s behavior undefined for
 * inconsistent comparators.  Return a proper three-way result.
 */
static int compare(const void *__sym1, const void *__sym2)
{
	const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
	double w1 = sym_weight(sym1);
	double w2 = sym_weight(sym2);

	if (w1 > w2)
		return -1;
	if (w1 < w2)
		return 1;
	return 0;
}
595 static time_t last_refresh
;
597 static long userspace_events
;
598 static const char CONSOLE_CLEAR
[] = "\e[H\e[2J";
600 static struct sym_entry tmp
[MAX_SYMS
];
602 static void print_sym_table(void)
606 float events_per_sec
= events
/delay_secs
;
607 float kevents_per_sec
= (events
-userspace_events
)/delay_secs
;
609 memcpy(tmp
, sym_table
, sizeof(sym_table
[0])*sym_table_count
);
610 qsort(tmp
, sym_table_count
, sizeof(tmp
[0]), compare
);
612 write(1, CONSOLE_CLEAR
, strlen(CONSOLE_CLEAR
));
615 "------------------------------------------------------------------------------\n");
616 printf( " KernelTop:%8.0f irqs/sec kernel:%3.1f%% [%s, ",
618 100.0 - (100.0*((events_per_sec
-kevents_per_sec
)/events_per_sec
)),
619 nmi
? "NMI" : "IRQ");
621 if (nr_counters
== 1)
622 printf("%d ", event_count
[0]);
624 for (counter
= 0; counter
< nr_counters
; counter
++) {
628 printf("%s", event_name(counter
));
634 printf(" (tid: %d", tid
);
638 if (profile_cpu
!= -1)
639 printf(", cpu: %d)\n", profile_cpu
);
644 printf(", %d CPUs)\n", nr_cpus
);
647 printf("------------------------------------------------------------------------------\n\n");
649 if (nr_counters
== 1)
652 printf(" weight events");
654 printf(" RIP kernel function\n"
655 " ______ ______ ________________ _______________\n\n"
659 for (i
= 0; i
< sym_table_count
; i
++) {
662 if (nr_counters
== 1) {
664 tmp
[i
].count
[0] >= count_filter
) {
665 printf("%19.2f - %016llx : %s\n",
666 sym_weight(tmp
+ i
), tmp
[i
].addr
, tmp
[i
].sym
);
671 tmp
[i
].count
[0] >= count_filter
) {
672 printf("%8.1f %10ld - %016llx : %s\n",
675 tmp
[i
].addr
, tmp
[i
].sym
);
680 * Add decay to the counts:
682 for (count
= 0; count
< nr_counters
; count
++)
683 sym_table
[i
].count
[count
] = zero
? 0 : sym_table
[i
].count
[count
] * 7 / 8;
686 if (sym_filter_entry
)
687 show_details(sym_filter_entry
);
689 last_refresh
= time(NULL
);
692 struct pollfd stdin_poll
= { .fd
= 0, .events
= POLLIN
};
694 if (poll(&stdin_poll
, 1, 0) == 1) {
695 printf("key pressed - exiting.\n");
701 static int read_symbol(FILE *in
, struct sym_entry
*s
)
703 static int filter_match
= 0;
708 rc
= fscanf(in
, "%llx %c %499s", &s
->addr
, &stype
, str
);
714 /* skip until end of line: */
718 if (rc
== '\n' || rc
== EOF
|| pos
>= 499)
727 /* Filter out known duplicates and non-text symbols. */
728 if (!strcmp(sym
, "_text"))
730 if (!min_ip
&& !strcmp(sym
, "_stext"))
732 if (!strcmp(sym
, "_etext") || !strcmp(sym
, "_sinittext"))
734 if (stype
!= 'T' && stype
!= 't')
736 if (!strncmp("init_module", sym
, 11) || !strncmp("cleanup_module", sym
, 14))
738 if (strstr(sym
, "_text_start") || strstr(sym
, "_text_end"))
741 s
->sym
= malloc(strlen(str
));
744 strcpy((char *)s
->sym
, str
);
747 /* Tag events to be skipped. */
748 if (!strcmp("default_idle", s
->sym
) || !strcmp("cpu_idle", s
->sym
))
750 if (!strcmp("enter_idle", s
->sym
) || !strcmp("exit_idle", s
->sym
))
753 if (filter_match
== 1) {
754 filter_end
= s
->addr
;
756 if (filter_end
- filter_start
> 10000) {
757 printf("hm, too large filter symbol <%s> - skipping.\n",
759 printf("symbol filter start: %016lx\n", filter_start
);
760 printf(" end: %016lx\n", filter_end
);
761 filter_end
= filter_start
= 0;
766 if (filter_match
== 0 && sym_filter
&& !strcmp(s
->sym
, sym_filter
)) {
768 filter_start
= s
->addr
;
774 int compare_addr(const void *__sym1
, const void *__sym2
)
776 const struct sym_entry
*sym1
= __sym1
, *sym2
= __sym2
;
778 return sym1
->addr
> sym2
->addr
;
/*
 * Sort the symbol table by address and squeeze out duplicate addresses
 * by poisoning the later entry and shrinking the table count.
 *
 * NOTE(review): the loop condition is 'i < sym_table_count' but the body
 * reads sym_table[i+1] — at the final iteration that reads one entry past
 * the populated range.  sym_table[] is a static MAX_SYMS array, so this is
 * stale data rather than a crash, but it looks like the bound should be
 * sym_table_count-1.  TODO confirm against the full function body.
 */
781 static void sort_symbol_table(void)
786 qsort(sym_table
, sym_table_count
, sizeof(sym_table
[0]), compare_addr
);
787 for (i
= 0, dups
= 0; i
< sym_table_count
; i
++) {
788 if (sym_table
[i
].addr
== sym_table
[i
+1].addr
) {
/* Poison the duplicate; presumably dropped after a later re-sort — verify. */
789 sym_table
[i
+1].addr
= -1ll;
793 sym_table_count
-= dups
;
797 static void parse_symbols(void)
799 struct sym_entry
*last
;
801 FILE *kallsyms
= fopen("/proc/kallsyms", "r");
804 printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n");
808 while (!feof(kallsyms
)) {
809 if (read_symbol(kallsyms
, &sym_table
[sym_table_count
]) == 0) {
811 assert(sym_table_count
<= MAX_SYMS
);
816 min_ip
= sym_table
[0].addr
;
817 max_ip
= sym_table
[sym_table_count
-1].addr
;
818 last
= sym_table
+ sym_table_count
++;
825 for (count
=0; count
< sym_table_count
; count
++) {
826 if (!strcmp(sym_table
[count
].sym
, sym_filter
)) {
827 sym_filter_entry
= &sym_table
[count
];
835 for (i
= 0; i
< sym_table_count
; i
++)
836 fprintf(stderr
, "%llx %s\n",
837 sym_table
[i
].addr
, sym_table
[i
].sym
);
845 static void parse_vmlinux(char *filename
)
848 char command
[PATH_MAX
*2];
852 sprintf(command
, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start
, filter_end
, filename
);
854 file
= popen(command
, "r");
858 while (!feof(file
)) {
859 struct source_line
*src
;
863 src
= malloc(sizeof(struct source_line
));
865 memset(src
, 0, sizeof(struct source_line
));
867 if (getline(&src
->line
, &dummy
, file
) < 0)
872 c
= strchr(src
->line
, '\n');
876 lines
= g_list_prepend(lines
, src
);
878 if (strlen(src
->line
)>8 && src
->line
[8] == ':')
879 src
->EIP
= strtoull(src
->line
, NULL
, 16);
880 if (strlen(src
->line
)>8 && src
->line
[16] == ':')
881 src
->EIP
= strtoull(src
->line
, NULL
, 16);
884 lines
= g_list_reverse(lines
);
887 static void record_precise_ip(uint64_t ip
)
889 struct source_line
*line
;
892 item
= g_list_first(lines
);
899 item
= g_list_next(item
);
903 static void lookup_sym_in_vmlinux(struct sym_entry
*sym
)
905 struct source_line
*line
;
907 char pattern
[PATH_MAX
];
908 sprintf(pattern
, "<%s>:", sym
->sym
);
910 item
= g_list_first(lines
);
913 if (strstr(line
->line
, pattern
)) {
917 item
= g_list_next(item
);
921 void show_lines(GList
*item_queue
, int item_queue_count
)
924 struct source_line
*line
;
926 for (i
= 0; i
< item_queue_count
; i
++) {
927 line
= item_queue
->data
;
928 printf("%8li\t%s\n", line
->count
, line
->line
);
929 item_queue
= g_list_next(item_queue
);
933 #define TRACE_COUNT 3
935 static void show_details(struct sym_entry
*sym
)
937 struct source_line
*line
;
940 GList
*item_queue
= NULL
;
941 int item_queue_count
= 0;
944 lookup_sym_in_vmlinux(sym
);
948 printf("Showing details for %s\n", sym
->sym
);
953 if (displayed
&& strstr(line
->line
, ">:"))
956 if (!item_queue_count
)
960 if (line
->count
>= count_filter
) {
961 show_lines(item_queue
, item_queue_count
);
962 item_queue_count
= 0;
964 } else if (item_queue_count
> TRACE_COUNT
) {
965 item_queue
= g_list_next(item_queue
);
973 item
= g_list_next(item
);
978 * Binary search in the histogram table and record the hit:
980 static void record_ip(uint64_t ip
, int counter
)
982 int left_idx
, middle_idx
, right_idx
, idx
;
983 unsigned long left
, middle
, right
;
985 record_precise_ip(ip
);
988 right_idx
= sym_table_count
-1;
989 assert(ip
<= max_ip
&& ip
>= min_ip
);
991 while (left_idx
+ 1 < right_idx
) {
992 middle_idx
= (left_idx
+ right_idx
) / 2;
994 left
= sym_table
[ left_idx
].addr
;
995 middle
= sym_table
[middle_idx
].addr
;
996 right
= sym_table
[ right_idx
].addr
;
998 if (!(left
<= middle
&& middle
<= right
)) {
999 printf("%016lx...\n%016lx...\n%016lx\n", left
, middle
, right
);
1000 printf("%d %d %d\n", left_idx
, middle_idx
, right_idx
);
1002 assert(left
<= middle
&& middle
<= right
);
1003 if (!(left
<= ip
&& ip
<= right
)) {
1004 printf(" left: %016lx\n", left
);
1005 printf(" ip: %016lx\n", (unsigned long)ip
);
1006 printf("right: %016lx\n", right
);
1008 assert(left
<= ip
&& ip
<= right
);
1010 * [ left .... target .... middle .... right ]
1011 * => right := middle
1014 right_idx
= middle_idx
;
1018 * [ left .... middle ... target ... right ]
1021 left_idx
= middle_idx
;
1026 if (!sym_table
[idx
].skip
)
1027 sym_table
[idx
].count
[counter
]++;
1031 static void process_event(uint64_t ip
, int counter
)
1035 if (ip
< min_ip
|| ip
> max_ip
) {
1040 record_ip(ip
, counter
);
1043 static void process_options(int argc
, char *argv
[])
1045 int error
= 0, counter
;
1047 if (strstr(argv
[0], "perfstat"))
1051 int option_index
= 0;
1052 /** Options for getopt */
1053 static struct option long_options
[] = {
1054 {"count", required_argument
, NULL
, 'c'},
1055 {"cpu", required_argument
, NULL
, 'C'},
1056 {"delay", required_argument
, NULL
, 'd'},
1057 {"dump_symtab", no_argument
, NULL
, 'D'},
1058 {"event", required_argument
, NULL
, 'e'},
1059 {"filter", required_argument
, NULL
, 'f'},
1060 {"group", required_argument
, NULL
, 'g'},
1061 {"help", no_argument
, NULL
, 'h'},
1062 {"nmi", required_argument
, NULL
, 'n'},
1063 {"pid", required_argument
, NULL
, 'p'},
1064 {"vmlinux", required_argument
, NULL
, 'x'},
1065 {"symbol", required_argument
, NULL
, 's'},
1066 {"stat", no_argument
, NULL
, 'S'},
1067 {"zero", no_argument
, NULL
, 'z'},
1070 int c
= getopt_long(argc
, argv
, "+:ac:C:d:De:f:g:hn:p:s:Sx:z",
1071 long_options
, &option_index
);
1076 case 'a': system_wide
= 1; break;
1077 case 'c': default_interval
= atoi(optarg
); break;
1079 /* CPU and PID are mutually exclusive */
1081 printf("WARNING: CPU switch overriding PID\n");
1085 profile_cpu
= atoi(optarg
); break;
1086 case 'd': delay_secs
= atoi(optarg
); break;
1087 case 'D': dump_symtab
= 1; break;
1089 case 'e': error
= parse_events(optarg
); break;
1091 case 'f': count_filter
= atoi(optarg
); break;
1092 case 'g': group
= atoi(optarg
); break;
1093 case 'h': display_help(); break;
1094 case 'n': nmi
= atoi(optarg
); break;
1096 /* CPU and PID are mutually exclusive */
1097 if (profile_cpu
!= -1) {
1098 printf("WARNING: PID switch overriding CPU\n");
1102 tid
= atoi(optarg
); break;
1103 case 's': sym_filter
= strdup(optarg
); break;
1104 case 'S': run_perfstat
= 1; break;
1105 case 'x': vmlinux
= strdup(optarg
); break;
1106 case 'z': zero
= 1; break;
1107 default: error
= 1; break;
1122 for (counter
= 0; counter
< nr_counters
; counter
++) {
1123 if (event_count
[counter
])
1126 event_count
[counter
] = default_interval
;
1137 static unsigned int mmap_read_head(struct mmap_data
*md
)
1139 struct perf_counter_mmap_page
*pc
= md
->base
;
1140 unsigned int seq
, head
;
1146 if (unlikely(seq
& 1)) {
1151 head
= pc
->data_head
;
1154 if (pc
->lock
!= seq
)
1160 static void mmap_read(struct mmap_data
*md
)
1162 unsigned int head
= mmap_read_head(md
);
1163 unsigned int old
= md
->prev
;
1164 unsigned char *data
= md
->base
+ page_size
;
1166 if (head
- old
> md
->mask
) {
1167 printf("ERROR: failed to keep up with mmap data\n");
1171 for (; old
!= head
;) {
1172 __u64
*ptr
= (__u64
*)&data
[old
& md
->mask
];
1173 old
+= sizeof(__u64
);
1175 process_event(*ptr
, md
->counter
);
1181 int main(int argc
, char *argv
[])
1183 struct pollfd event_array
[MAX_NR_CPUS
][MAX_COUNTERS
];
1184 struct mmap_data mmap_array
[MAX_NR_CPUS
][MAX_COUNTERS
];
1185 struct perf_counter_hw_event hw_event
;
1186 int i
, counter
, group_fd
;
1190 page_size
= sysconf(_SC_PAGE_SIZE
);
1192 process_options(argc
, argv
);
1194 nr_cpus
= sysconf(_SC_NPROCESSORS_ONLN
);
1195 assert(nr_cpus
<= MAX_NR_CPUS
);
1196 assert(nr_cpus
>= 0);
1199 return do_perfstat(argc
, argv
);
1201 if (tid
!= -1 || profile_cpu
!= -1)
1204 for (i
= 0; i
< nr_cpus
; i
++) {
1206 for (counter
= 0; counter
< nr_counters
; counter
++) {
1209 if (tid
== -1 && profile_cpu
== -1)
1212 memset(&hw_event
, 0, sizeof(hw_event
));
1213 hw_event
.config
= event_id
[counter
];
1214 hw_event
.irq_period
= event_count
[counter
];
1215 hw_event
.record_type
= PERF_RECORD_IRQ
;
1218 fd
[i
][counter
] = sys_perf_counter_open(&hw_event
, tid
, cpu
, group_fd
, 0);
1219 fcntl(fd
[i
][counter
], F_SETFL
, O_NONBLOCK
);
1220 if (fd
[i
][counter
] < 0) {
1221 printf("kerneltop error: syscall returned with %d (%s)\n",
1222 fd
[i
][counter
], strerror(-fd
[i
][counter
]));
1223 if (fd
[i
][counter
] == -1)
1224 printf("Are you root?\n");
1227 assert(fd
[i
][counter
] >= 0);
1230 * First counter acts as the group leader:
1232 if (group
&& group_fd
== -1)
1233 group_fd
= fd
[i
][counter
];
1235 event_array
[i
][counter
].fd
= fd
[i
][counter
];
1236 event_array
[i
][counter
].events
= POLLIN
;
1238 mmap_array
[i
][counter
].counter
= counter
;
1239 mmap_array
[i
][counter
].prev
= 0;
1240 mmap_array
[i
][counter
].mask
= 2*page_size
- 1;
1241 mmap_array
[i
][counter
].base
= mmap(NULL
, 3*page_size
,
1242 PROT_READ
, MAP_SHARED
, fd
[i
][counter
], 0);
1243 if (mmap_array
[i
][counter
].base
== MAP_FAILED
) {
1244 printf("kerneltop error: failed to mmap with %d (%s)\n",
1245 errno
, strerror(errno
));
1252 if (vmlinux
&& sym_filter_entry
)
1253 parse_vmlinux(vmlinux
);
1255 printf("KernelTop refresh period: %d seconds\n", delay_secs
);
1256 last_refresh
= time(NULL
);
1261 for (i
= 0; i
< nr_cpus
; i
++) {
1262 for (counter
= 0; counter
< nr_counters
; counter
++)
1263 mmap_read(&mmap_array
[i
][counter
]);
1266 if (time(NULL
) >= last_refresh
+ delay_secs
) {
1268 events
= userspace_events
= 0;
1272 ret
= poll(event_array
[0], nr_cpus
, 1000);