perf_counter tools: increase cpu-cycles again
Documentation/perf_counter/kerneltop.c
1 /*
2 * kerneltop.c: show top kernel functions - performance counters showcase
3
4 Build with:
5
6 cc -O6 -Wall -lrt `pkg-config --cflags --libs glib-2.0` -o kerneltop kerneltop.c
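 (-lrt is only needed for clock_gettime(); glib provides the GList used
 below for the --symbol source-line annotation.)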
7
8 Sample output:
9
10 ------------------------------------------------------------------------------
11 KernelTop: 2669 irqs/sec [NMI, cache-misses/cache-refs], (all, cpu: 2)
12 ------------------------------------------------------------------------------
13
14 weight RIP kernel function
15 ______ ________________ _______________
16
17 35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
18 33.00 - ffffffff804cb740 : sock_alloc_send_skb
19 31.26 - ffffffff804ce808 : skb_push
20 22.43 - ffffffff80510004 : tcp_established_options
21 19.00 - ffffffff8027d250 : find_get_page
22 15.76 - ffffffff804e4fc9 : eth_type_trans
23 15.20 - ffffffff804d8baa : dst_release
24 14.86 - ffffffff804cf5d8 : skb_release_head_state
25 14.00 - ffffffff802217d5 : read_hpet
26 12.00 - ffffffff804ffb7f : __ip_local_out
27 11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
28 8.54 - ffffffff805001a3 : ip_queue_xmit
29 */
30
31 /*
32 * perfstat: /usr/bin/time -alike performance counter statistics utility
33
34 It summarizes the counter events of all tasks (and child tasks),
35 covering all CPUs that the command (or workload) executes on.
36 It only counts the per-task events of the workload started,
37 independent of how many other tasks run on those CPUs.
38
39 Sample output:
40
41 $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
42
43 Performance counter stats for 'ls':
44
45 163516953 instructions
46 2295 cache-misses
47 2855182 branch-misses
48 */
49
50 /*
51 * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
52 *
53 * Improvements and fixes by:
54 *
55 * Arjan van de Ven <arjan@linux.intel.com>
56 * Yanmin Zhang <yanmin.zhang@intel.com>
57 * Wu Fengguang <fengguang.wu@intel.com>
58 * Mike Galbraith <efault@gmx.de>
59 *
60 * Released under the GPL v2. (and only v2, not any later version)
61 */
62
63 #define _GNU_SOURCE
64 #include <sys/types.h>
65 #include <sys/stat.h>
66 #include <sys/time.h>
67 #include <unistd.h>
68 #include <stdint.h>
69 #include <stdlib.h>
70 #include <string.h>
71 #include <getopt.h>
72 #include <assert.h>
73 #include <fcntl.h>
74 #include <stdio.h>
75 #include <errno.h>
76 #include <ctype.h>
77 #include <time.h>
78
79 #include <glib.h>
80
81 #include <sys/syscall.h>
82 #include <sys/ioctl.h>
83 #include <sys/poll.h>
84 #include <sys/prctl.h>
85 #include <sys/wait.h>
86 #include <sys/uio.h>
87 #include <sys/mman.h>
88
89 #include <linux/unistd.h>
90
91 #include "../../include/linux/perf_counter.h"
92
93
94 /*
95 * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
96 * counters in the current task.
97 */
98 #define PR_TASK_PERF_COUNTERS_DISABLE 31
99 #define PR_TASK_PERF_COUNTERS_ENABLE 32
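
/*
 * do_perfstat() brackets the fork()+exec() of the measured command with
 * these two prctl() calls, so only events raised while the workload runs
 * are counted.
 */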
100
101 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
102
103 #define rdclock() \
104 ({ \
105 struct timespec ts; \
106 \
107 clock_gettime(CLOCK_MONOTONIC, &ts); \
108 ts.tv_sec * 1000000000ULL + ts.tv_nsec; \
109 })
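
/*
 * rdclock() returns CLOCK_MONOTONIC time in nanoseconds; do_perfstat()
 * uses the t1 - t0 difference for its wall-clock summary line.
 */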
110
111 /*
112 * Pick up some kernel type conventions:
113 */
114 #define __user
115 #define asmlinkage
116
117 typedef unsigned int __u32;
118 typedef unsigned long long __u64;
119 typedef long long __s64;
120
121
122 #ifdef __x86_64__
123 #define __NR_perf_counter_open 295
124 #define rmb() asm volatile("lfence" ::: "memory")
125 #define cpu_relax() asm volatile("rep; nop" ::: "memory")
126 #endif
127
128 #ifdef __i386__
129 #define __NR_perf_counter_open 333
130 #define rmb() asm volatile("lfence" ::: "memory")
131 #define cpu_relax() asm volatile("rep; nop" ::: "memory")
132 #endif
133
134 #ifdef __powerpc__
135 #define __NR_perf_counter_open 319
136 #define rmb() asm volatile ("sync" ::: "memory")
137 #define cpu_relax() asm volatile ("" ::: "memory")
138 #endif
139
140 #define unlikely(x) __builtin_expect(!!(x), 0)
141
142 asmlinkage int sys_perf_counter_open(
143 struct perf_counter_hw_event *hw_event_uptr __user,
144 pid_t pid,
145 int cpu,
146 int group_fd,
147 unsigned long flags)
148 {
149 int ret;
150
151 ret = syscall(
152 __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
153 #if defined(__x86_64__) || defined(__i386__)
154 if (ret < 0 && ret > -4096) {
155 errno = -ret;
156 ret = -1;
157 }
158 #endif
159 return ret;
160 }
161
162 #define MAX_COUNTERS 64
163 #define MAX_NR_CPUS 256
164
165 #define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
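
/*
 * EID() packs the counter type into the high bits (PERF_COUNTER_TYPE_SHIFT)
 * and the event id into the low bits; the PERF_COUNTER_TYPE()/PERF_COUNTER_ID()
 * helpers further down undo exactly this packing.
 */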
166
167 static int run_perfstat = 0;
168 static int system_wide = 0;
169
170 static int nr_counters = 0;
171 static __u64 event_id[MAX_COUNTERS] = {
172 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
173 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
174 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
175 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
176
177 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
178 EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
179 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
180 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
181 };
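
/*
 * The eight entries above are the perfstat defaults: process_options()
 * sets nr_counters = 8 when no -e option was given. Plain kerneltop
 * falls back to a single counter with event_id[0] = 0 instead.
 */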
182 static int default_interval = 100000;
183 static int event_count[MAX_COUNTERS];
184 static int fd[MAX_NR_CPUS][MAX_COUNTERS];
185
186 static __u64 count_filter = 100;
187
188 static int tid = -1;
189 static int profile_cpu = -1;
190 static int nr_cpus = 0;
191 static int nmi = 1;
192 static int group = 0;
193 static unsigned int page_size;
194
195 static char *vmlinux;
196
197 static char *sym_filter;
198 static unsigned long filter_start;
199 static unsigned long filter_end;
200
201 static int delay_secs = 2;
202 static int zero;
203 static int dump_symtab;
204
205 static GList *lines;
206
207 struct source_line {
208 uint64_t EIP;
209 unsigned long count;
210 char *line;
211 };
212
213
214 const unsigned int default_count[] = {
215 1000000,
216 1000000,
217 10000,
218 10000,
219 1000000,
220 10000,
221 };
222
223 static char *hw_event_names[] = {
224 "CPU cycles",
225 "instructions",
226 "cache references",
227 "cache misses",
228 "branches",
229 "branch misses",
230 "bus cycles",
231 };
232
233 static char *sw_event_names[] = {
234 "cpu clock ticks",
235 "task clock ticks",
236 "pagefaults",
237 "context switches",
238 "CPU migrations",
239 "minor faults",
240 "major faults",
241 };
242
243 struct event_symbol {
244 __u64 event;
245 char *symbol;
246 };
247
248 static struct event_symbol event_symbols[] = {
249 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", },
250 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", },
251 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", },
252 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", },
253 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", },
254 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", },
255 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", },
256 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", },
257 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", },
258
259 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", },
260 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", },
261 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", },
262 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", },
263 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", },
264 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", },
265 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", },
266 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", },
267 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", },
268 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", },
269 };
270
271 #define __PERF_COUNTER_FIELD(config, name) \
272 ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
273
274 #define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW)
275 #define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG)
276 #define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE)
277 #define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT)
278
279 static void display_events_help(void)
280 {
281 unsigned int i;
282 __u64 e;
283
284 printf(
285 " -e EVENT --event=EVENT # symbolic-name abbreviations");
286
287 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
288 int type, id;
289
290 e = event_symbols[i].event;
291 type = PERF_COUNTER_TYPE(e);
292 id = PERF_COUNTER_ID(e);
293
294 printf("\n %d:%d: %-20s",
295 type, id, event_symbols[i].symbol);
296 }
297
298 printf("\n"
299 " rNNN: raw PMU events (eventsel+umask)\n\n");
300 }
301
302 static void display_perfstat_help(void)
303 {
304 printf(
305 "Usage: perfstat [<events...>] <cmd...>\n\n"
306 "PerfStat Options (up to %d event types can be specified):\n\n",
307 MAX_COUNTERS);
308
309 display_events_help();
310
311 printf(
312 " -a # system-wide collection\n");
313 exit(0);
314 }
315
316 static void display_help(void)
317 {
318 if (run_perfstat)
319 return display_perfstat_help();
320
321 printf(
322 "Usage: kerneltop [<options>]\n"
323 " Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n"
324 "KernelTop Options (up to %d event types can be specified at once):\n\n",
325 MAX_COUNTERS);
326
327 display_events_help();
328
329 printf(
330 " -S --stat # perfstat COMMAND\n"
331 " -a # system-wide collection (for perfstat)\n\n"
332 " -c CNT --count=CNT # event period to sample\n\n"
333 " -C CPU --cpu=CPU # CPU (-1 for all) [default: -1]\n"
334 " -p PID --pid=PID # PID of sampled task (-1 for all) [default: -1]\n\n"
335 " -d delay --delay=<seconds> # sampling/display delay [default: 2]\n"
336 " -f CNT --filter=CNT # min-event-count filter [default: 100]\n\n"
337 " -s symbol --symbol=<symbol> # function to be showed annotated one-shot\n"
338 " -x path --vmlinux=<path> # the vmlinux binary, required for -s use\n"
339 " -z --zero # zero counts after display\n"
340 " -D --dump_symtab # dump symbol table to stderr on startup\n"
341 );
342
343 exit(0);
344 }
345
346 static char *event_name(int ctr)
347 {
348 __u64 config = event_id[ctr];
349 int type = PERF_COUNTER_TYPE(config);
350 int id = PERF_COUNTER_ID(config);
351 static char buf[32];
352
353 if (PERF_COUNTER_RAW(config)) {
354 sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
355 return buf;
356 }
357
358 switch (type) {
359 case PERF_TYPE_HARDWARE:
360 if (id < PERF_HW_EVENTS_MAX)
361 return hw_event_names[id];
362 return "unknown-hardware";
363
364 case PERF_TYPE_SOFTWARE:
365 if (id < PERF_SW_EVENTS_MAX)
366 return sw_event_names[id];
367 return "unknown-software";
368
369 default:
370 break;
371 }
372
373 return "unknown";
374 }
375
376 /*
377 * Each event can have multiple symbolic names.
378 * Symbolic names are (almost) exactly matched.
379 */
380 static __u64 match_event_symbols(char *str)
381 {
382 __u64 config, id;
383 int type;
384 unsigned int i;
385
386 if (sscanf(str, "r%llx", &config) == 1)
387 return config | PERF_COUNTER_RAW_MASK;
388
389 if (sscanf(str, "%d:%llu", &type, &id) == 2)
390 return EID(type, id);
391
392 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
393 if (!strncmp(str, event_symbols[i].symbol,
394 strlen(event_symbols[i].symbol)))
395 return event_symbols[i].event;
396 }
397
398 return ~0ULL;
399 }
400
401 static int parse_events(char *str)
402 {
403 __u64 config;
404
405 again:
406 if (nr_counters == MAX_COUNTERS)
407 return -1;
408
409 config = match_event_symbols(str);
410 if (config == ~0ULL)
411 return -1;
412
413 event_id[nr_counters] = config;
414 nr_counters++;
415
416 str = strstr(str, ",");
417 if (str) {
418 str++;
419 goto again;
420 }
421
422 return 0;
423 }
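
/*
 * Accepted -e forms, as parsed above: a raw "rNNN" hex value, a numeric
 * "type:id" pair as listed by the -e help output, or a symbolic name from
 * event_symbols[] (e.g. "cycles", "cs"); only the leading part of the
 * string is compared, which is what lets a comma-separated list such as
 * "-e cycles,faults" add one counter per name.
 */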
424
425
426 /*
427 * perfstat
428 */
429
430 char fault_here[1000000];
431
432 static void create_perfstat_counter(int counter)
433 {
434 struct perf_counter_hw_event hw_event;
435
436 memset(&hw_event, 0, sizeof(hw_event));
437 hw_event.config = event_id[counter];
438 hw_event.record_type = PERF_RECORD_SIMPLE;
439 hw_event.nmi = 0;
440
441 if (system_wide) {
442 int cpu;
443 for (cpu = 0; cpu < nr_cpus; cpu ++) {
444 fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
445 if (fd[cpu][counter] < 0) {
446 printf("perfstat error: syscall returned with %d (%s)\n",
447 fd[cpu][counter], strerror(errno));
448 exit(-1);
449 }
450 }
451 } else {
452 hw_event.inherit = 1;
453 hw_event.disabled = 1;
454
455 fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
456 if (fd[0][counter] < 0) {
457 printf("perfstat error: syscall returned with %d (%s)\n",
458 fd[0][counter], strerror(errno));
459 exit(-1);
460 }
461 }
462 }
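
/*
 * In system-wide mode one counter per CPU is opened with pid == -1;
 * otherwise a single inherited, initially disabled counter is attached
 * to the current task (pid 0), so the exec'ed workload and its children
 * are what gets counted.
 */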
463
464 int do_perfstat(int argc, char *argv[])
465 {
466 unsigned long long t0, t1;
467 int counter;
468 ssize_t res;
469 int status;
470 int pid;
471
472 if (!system_wide)
473 nr_cpus = 1;
474
475 for (counter = 0; counter < nr_counters; counter++)
476 create_perfstat_counter(counter);
477
478 argc -= optind;
479 argv += optind;
480
481 if (!argc)
482 display_help();
483
484 /*
485 * Enable counters and exec the command:
486 */
487 t0 = rdclock();
488 prctl(PR_TASK_PERF_COUNTERS_ENABLE);
489
490 if ((pid = fork()) < 0)
491 perror("failed to fork");
492 if (!pid) {
493 if (execvp(argv[0], argv)) {
494 perror(argv[0]);
495 exit(-1);
496 }
497 }
498 while (wait(&status) >= 0)
499 ;
500 prctl(PR_TASK_PERF_COUNTERS_DISABLE);
501 t1 = rdclock();
502
503 fflush(stdout);
504
505 fprintf(stderr, "\n");
506 fprintf(stderr, " Performance counter stats for \'%s\':\n",
507 argv[0]);
508 fprintf(stderr, "\n");
509
510 for (counter = 0; counter < nr_counters; counter++) {
511 int cpu;
512 __u64 count, single_count;
513
514 count = 0;
515 for (cpu = 0; cpu < nr_cpus; cpu ++) {
516 res = read(fd[cpu][counter],
517 (char *) &single_count, sizeof(single_count));
518 assert(res == sizeof(single_count));
519 count += single_count;
520 }
521
522 if (!PERF_COUNTER_RAW(event_id[counter]) &&
523 (event_id[counter] == PERF_COUNT_CPU_CLOCK ||
524 event_id[counter] == PERF_COUNT_TASK_CLOCK)) {
525
526 double msecs = (double)count / 1000000;
527
528 fprintf(stderr, " %14.6f %-20s (msecs)\n",
529 msecs, event_name(counter));
530 } else {
531 fprintf(stderr, " %14Ld %-20s (events)\n",
532 count, event_name(counter));
533 }
534 if (!counter)
535 fprintf(stderr, "\n");
536 }
537 fprintf(stderr, "\n");
538 fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
539 (double)(t1-t0)/1e6);
540 fprintf(stderr, "\n");
541
542 return 0;
543 }
544
545 /*
546 * Symbols
547 */
548
549 static uint64_t min_ip;
550 static uint64_t max_ip = -1ll;
551
552 struct sym_entry {
553 unsigned long long addr;
554 char *sym;
555 unsigned long count[MAX_COUNTERS];
556 int skip;
557 GList *source;
558 };
559
560 #define MAX_SYMS 100000
561
562 static int sym_table_count;
563
564 struct sym_entry *sym_filter_entry;
565
566 static struct sym_entry sym_table[MAX_SYMS];
567
568 static void show_details(struct sym_entry *sym);
569
570 /*
571 * Ordering weight: count-1 * count-2 * ... / count-n
572 */
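/* (with two counters this reduces to count[0] / (count[1] + 1)) */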
573 static double sym_weight(const struct sym_entry *sym)
574 {
575 double weight;
576 int counter;
577
578 weight = sym->count[0];
579
580 for (counter = 1; counter < nr_counters-1; counter++)
581 weight *= sym->count[counter];
582
583 weight /= (sym->count[counter] + 1);
584
585 return weight;
586 }
587
588 static int compare(const void *__sym1, const void *__sym2)
589 {
590 const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
591
592 return sym_weight(sym1) < sym_weight(sym2);
593 }
594
595 static time_t last_refresh;
596 static long events;
597 static long userspace_events;
598 static const char CONSOLE_CLEAR[] = "\e[H\e[2J";
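/* ANSI "cursor home" + "erase display": gives print_sym_table() its top(1)-style full-screen refresh */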
599
600 static struct sym_entry tmp[MAX_SYMS];
601
602 static void print_sym_table(void)
603 {
604 int i, printed;
605 int counter;
606 float events_per_sec = events/delay_secs;
607 float kevents_per_sec = (events-userspace_events)/delay_secs;
608
609 memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count);
610 qsort(tmp, sym_table_count, sizeof(tmp[0]), compare);
611
612 write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));
613
614 printf(
615 "------------------------------------------------------------------------------\n");
616 printf( " KernelTop:%8.0f irqs/sec kernel:%3.1f%% [%s, ",
617 events_per_sec,
618 100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)),
619 nmi ? "NMI" : "IRQ");
620
621 if (nr_counters == 1)
622 printf("%d ", event_count[0]);
623
624 for (counter = 0; counter < nr_counters; counter++) {
625 if (counter)
626 printf("/");
627
628 printf("%s", event_name(counter));
629 }
630
631 printf( "], ");
632
633 if (tid != -1)
634 printf(" (tid: %d", tid);
635 else
636 printf(" (all");
637
638 if (profile_cpu != -1)
639 printf(", cpu: %d)\n", profile_cpu);
640 else {
641 if (tid != -1)
642 printf(")\n");
643 else
644 printf(", %d CPUs)\n", nr_cpus);
645 }
646
647 printf("------------------------------------------------------------------------------\n\n");
648
649 if (nr_counters == 1)
650 printf(" events");
651 else
652 printf(" weight events");
653
654 printf(" RIP kernel function\n"
655 " ______ ______ ________________ _______________\n\n"
656 );
657
658 printed = 0;
659 for (i = 0; i < sym_table_count; i++) {
660 int count;
661
662 if (nr_counters == 1) {
663 if (printed <= 18 &&
664 tmp[i].count[0] >= count_filter) {
665 printf("%19.2f - %016llx : %s\n",
666 sym_weight(tmp + i), tmp[i].addr, tmp[i].sym);
667 printed++;
668 }
669 } else {
670 if (printed <= 18 &&
671 tmp[i].count[0] >= count_filter) {
672 printf("%8.1f %10ld - %016llx : %s\n",
673 sym_weight(tmp + i),
674 tmp[i].count[0],
675 tmp[i].addr, tmp[i].sym);
676 printed++;
677 }
678 }
679 /*
680 * Add decay to the counts:
681 */
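/* (scaling by 7/8 per refresh roughly halves a stale count every five refreshes; -z/--zero clears instead) */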
682 for (count = 0; count < nr_counters; count++)
683 sym_table[i].count[count] = zero ? 0 : sym_table[i].count[count] * 7 / 8;
684 }
685
686 if (sym_filter_entry)
687 show_details(sym_filter_entry);
688
689 last_refresh = time(NULL);
690
691 {
692 struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
693
694 if (poll(&stdin_poll, 1, 0) == 1) {
695 printf("key pressed - exiting.\n");
696 exit(0);
697 }
698 }
699 }
700
701 static int read_symbol(FILE *in, struct sym_entry *s)
702 {
703 static int filter_match = 0;
704 char *sym, stype;
705 char str[500];
706 int rc, pos;
707
708 rc = fscanf(in, "%llx %c %499s", &s->addr, &stype, str);
709 if (rc == EOF)
710 return -1;
711
712 assert(rc == 3);
713
714 /* skip until end of line: */
715 pos = strlen(str);
716 do {
717 rc = fgetc(in);
718 if (rc == '\n' || rc == EOF || pos >= 499)
719 break;
720 str[pos] = rc;
721 pos++;
722 } while (1);
723 str[pos] = 0;
724
725 sym = str;
726
727 /* Filter out known duplicates and non-text symbols. */
728 if (!strcmp(sym, "_text"))
729 return 1;
730 if (!min_ip && !strcmp(sym, "_stext"))
731 return 1;
732 if (!strcmp(sym, "_etext") || !strcmp(sym, "_sinittext"))
733 return 1;
734 if (stype != 'T' && stype != 't')
735 return 1;
736 if (!strncmp("init_module", sym, 11) || !strncmp("cleanup_module", sym, 14))
737 return 1;
738 if (strstr(sym, "_text_start") || strstr(sym, "_text_end"))
739 return 1;
740
741 s->sym = malloc(strlen(str) + 1);
742 assert(s->sym);
743
744 strcpy((char *)s->sym, str);
745 s->skip = 0;
746
747 /* Tag events to be skipped. */
748 if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym))
749 s->skip = 1;
750 if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
751 s->skip = 1;
752
753 if (filter_match == 1) {
754 filter_end = s->addr;
755 filter_match = -1;
756 if (filter_end - filter_start > 10000) {
757 printf("hm, too large filter symbol <%s> - skipping.\n",
758 sym_filter);
759 printf("symbol filter start: %016lx\n", filter_start);
760 printf(" end: %016lx\n", filter_end);
761 filter_end = filter_start = 0;
762 sym_filter = NULL;
763 sleep(1);
764 }
765 }
766 if (filter_match == 0 && sym_filter && !strcmp(s->sym, sym_filter)) {
767 filter_match = 1;
768 filter_start = s->addr;
769 }
770
771 return 0;
772 }
773
774 int compare_addr(const void *__sym1, const void *__sym2)
775 {
776 const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
777
778 return sym1->addr > sym2->addr;
779 }
780
781 static void sort_symbol_table(void)
782 {
783 int i, dups;
784
785 do {
786 qsort(sym_table, sym_table_count, sizeof(sym_table[0]), compare_addr);
787 for (i = 0, dups = 0; i < sym_table_count - 1; i++) {
788 if (sym_table[i].addr == sym_table[i+1].addr) {
789 sym_table[i+1].addr = -1ll;
790 dups++;
791 }
792 }
793 sym_table_count -= dups;
794 } while(dups);
795 }
796
797 static void parse_symbols(void)
798 {
799 struct sym_entry *last;
800
801 FILE *kallsyms = fopen("/proc/kallsyms", "r");
802
803 if (!kallsyms) {
804 printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n");
805 exit(-1);
806 }
807
808 while (!feof(kallsyms)) {
809 if (read_symbol(kallsyms, &sym_table[sym_table_count]) == 0) {
810 sym_table_count++;
811 assert(sym_table_count <= MAX_SYMS);
812 }
813 }
814
815 sort_symbol_table();
816 min_ip = sym_table[0].addr;
817 max_ip = sym_table[sym_table_count-1].addr;
818 last = sym_table + sym_table_count++;
819
820 last->addr = -1ll;
821 last->sym = "<end>";
822
823 if (filter_end) {
824 int count;
825 for (count=0; count < sym_table_count; count ++) {
826 if (!strcmp(sym_table[count].sym, sym_filter)) {
827 sym_filter_entry = &sym_table[count];
828 break;
829 }
830 }
831 }
832 if (dump_symtab) {
833 int i;
834
835 for (i = 0; i < sym_table_count; i++)
836 fprintf(stderr, "%llx %s\n",
837 sym_table[i].addr, sym_table[i].sym);
838 }
839 }
840
841 /*
842 * Source lines
843 */
844
845 static void parse_vmlinux(char *filename)
846 {
847 FILE *file;
848 char command[PATH_MAX*2];
849 if (!filename)
850 return;
851
852 sprintf(command, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start, filter_end, filename);
853
854 file = popen(command, "r");
855 if (!file)
856 return;
857
858 while (!feof(file)) {
859 struct source_line *src;
860 size_t dummy = 0;
861 char *c;
862
863 src = malloc(sizeof(struct source_line));
864 assert(src != NULL);
865 memset(src, 0, sizeof(struct source_line));
866
867 if (getline(&src->line, &dummy, file) < 0)
868 break;
869 if (!src->line)
870 break;
871
872 c = strchr(src->line, '\n');
873 if (c)
874 *c = 0;
875
876 lines = g_list_prepend(lines, src);
877
878 if (strlen(src->line)>8 && src->line[8] == ':')
879 src->EIP = strtoull(src->line, NULL, 16);
880 if (strlen(src->line)>16 && src->line[16] == ':')
881 src->EIP = strtoull(src->line, NULL, 16);
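/*
 * objdump -dS prefixes each instruction with "<address>:"; checking for
 * the ':' at column 8 or 16 picks up 8-digit (32-bit) and 16-digit
 * (64-bit) address fields respectively.
 */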
882 }
883 pclose(file);
884 lines = g_list_reverse(lines);
885 }
886
887 static void record_precise_ip(uint64_t ip)
888 {
889 struct source_line *line;
890 GList *item;
891
892 item = g_list_first(lines);
893 while (item) {
894 line = item->data;
895 if (line->EIP == ip)
896 line->count++;
897 if (line->EIP > ip)
898 break;
899 item = g_list_next(item);
900 }
901 }
902
903 static void lookup_sym_in_vmlinux(struct sym_entry *sym)
904 {
905 struct source_line *line;
906 GList *item;
907 char pattern[PATH_MAX];
908 sprintf(pattern, "<%s>:", sym->sym);
909
910 item = g_list_first(lines);
911 while (item) {
912 line = item->data;
913 if (strstr(line->line, pattern)) {
914 sym->source = item;
915 break;
916 }
917 item = g_list_next(item);
918 }
919 }
920
921 void show_lines(GList *item_queue, int item_queue_count)
922 {
923 int i;
924 struct source_line *line;
925
926 for (i = 0; i < item_queue_count; i++) {
927 line = item_queue->data;
928 printf("%8li\t%s\n", line->count, line->line);
929 item_queue = g_list_next(item_queue);
930 }
931 }
932
933 #define TRACE_COUNT 3
934
935 static void show_details(struct sym_entry *sym)
936 {
937 struct source_line *line;
938 GList *item;
939 int displayed = 0;
940 GList *item_queue = NULL;
941 int item_queue_count = 0;
942
943 if (!sym->source)
944 lookup_sym_in_vmlinux(sym);
945 if (!sym->source)
946 return;
947
948 printf("Showing details for %s\n", sym->sym);
949
950 item = sym->source;
951 while (item) {
952 line = item->data;
953 if (displayed && strstr(line->line, ">:"))
954 break;
955
956 if (!item_queue_count)
957 item_queue = item;
958 item_queue_count ++;
959
960 if (line->count >= count_filter) {
961 show_lines(item_queue, item_queue_count);
962 item_queue_count = 0;
963 item_queue = NULL;
964 } else if (item_queue_count > TRACE_COUNT) {
965 item_queue = g_list_next(item_queue);
966 item_queue_count --;
967 }
968
969 line->count = 0;
970 displayed++;
971 if (displayed > 300)
972 break;
973 item = g_list_next(item);
974 }
975 }
976
977 /*
978 * Binary search in the histogram table and record the hit:
979 */
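/*
 * sym_table is sorted by address (sort_symbol_table()) and terminated by
 * the "<end>" sentinel, so left_idx ends up at the symbol whose
 * [addr, next addr) range contains ip.
 */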
980 static void record_ip(uint64_t ip, int counter)
981 {
982 int left_idx, middle_idx, right_idx, idx;
983 unsigned long left, middle, right;
984
985 record_precise_ip(ip);
986
987 left_idx = 0;
988 right_idx = sym_table_count-1;
989 assert(ip <= max_ip && ip >= min_ip);
990
991 while (left_idx + 1 < right_idx) {
992 middle_idx = (left_idx + right_idx) / 2;
993
994 left = sym_table[ left_idx].addr;
995 middle = sym_table[middle_idx].addr;
996 right = sym_table[ right_idx].addr;
997
998 if (!(left <= middle && middle <= right)) {
999 printf("%016lx...\n%016lx...\n%016lx\n", left, middle, right);
1000 printf("%d %d %d\n", left_idx, middle_idx, right_idx);
1001 }
1002 assert(left <= middle && middle <= right);
1003 if (!(left <= ip && ip <= right)) {
1004 printf(" left: %016lx\n", left);
1005 printf(" ip: %016lx\n", (unsigned long)ip);
1006 printf("right: %016lx\n", right);
1007 }
1008 assert(left <= ip && ip <= right);
1009 /*
1010 * [ left .... target .... middle .... right ]
1011 * => right := middle
1012 */
1013 if (ip < middle) {
1014 right_idx = middle_idx;
1015 continue;
1016 }
1017 /*
1018 * [ left .... middle ... target ... right ]
1019 * => left := middle
1020 */
1021 left_idx = middle_idx;
1022 }
1023
1024 idx = left_idx;
1025
1026 if (!sym_table[idx].skip)
1027 sym_table[idx].count[counter]++;
1028 else events--;
1029 }
1030
1031 static void process_event(uint64_t ip, int counter)
1032 {
1033 events++;
1034
1035 if (ip < min_ip || ip > max_ip) {
1036 userspace_events++;
1037 return;
1038 }
1039
1040 record_ip(ip, counter);
1041 }
1042
1043 static void process_options(int argc, char *argv[])
1044 {
1045 int error = 0, counter;
1046
1047 if (strstr(argv[0], "perfstat"))
1048 run_perfstat = 1;
1049
1050 for (;;) {
1051 int option_index = 0;
1052 /** Options for getopt */
1053 static struct option long_options[] = {
1054 {"count", required_argument, NULL, 'c'},
1055 {"cpu", required_argument, NULL, 'C'},
1056 {"delay", required_argument, NULL, 'd'},
1057 {"dump_symtab", no_argument, NULL, 'D'},
1058 {"event", required_argument, NULL, 'e'},
1059 {"filter", required_argument, NULL, 'f'},
1060 {"group", required_argument, NULL, 'g'},
1061 {"help", no_argument, NULL, 'h'},
1062 {"nmi", required_argument, NULL, 'n'},
1063 {"pid", required_argument, NULL, 'p'},
1064 {"vmlinux", required_argument, NULL, 'x'},
1065 {"symbol", required_argument, NULL, 's'},
1066 {"stat", no_argument, NULL, 'S'},
1067 {"zero", no_argument, NULL, 'z'},
1068 {NULL, 0, NULL, 0 }
1069 };
1070 int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hn:p:s:Sx:z",
1071 long_options, &option_index);
1072 if (c == -1)
1073 break;
1074
1075 switch (c) {
1076 case 'a': system_wide = 1; break;
1077 case 'c': default_interval = atoi(optarg); break;
1078 case 'C':
1079 /* CPU and PID are mutually exclusive */
1080 if (tid != -1) {
1081 printf("WARNING: CPU switch overriding PID\n");
1082 sleep(1);
1083 tid = -1;
1084 }
1085 profile_cpu = atoi(optarg); break;
1086 case 'd': delay_secs = atoi(optarg); break;
1087 case 'D': dump_symtab = 1; break;
1088
1089 case 'e': error = parse_events(optarg); break;
1090
1091 case 'f': count_filter = atoi(optarg); break;
1092 case 'g': group = atoi(optarg); break;
1093 case 'h': display_help(); break;
1094 case 'n': nmi = atoi(optarg); break;
1095 case 'p':
1096 /* CPU and PID are mutually exclusive */
1097 if (profile_cpu != -1) {
1098 printf("WARNING: PID switch overriding CPU\n");
1099 sleep(1);
1100 profile_cpu = -1;
1101 }
1102 tid = atoi(optarg); break;
1103 case 's': sym_filter = strdup(optarg); break;
1104 case 'S': run_perfstat = 1; break;
1105 case 'x': vmlinux = strdup(optarg); break;
1106 case 'z': zero = 1; break;
1107 default: error = 1; break;
1108 }
1109 }
1110 if (error)
1111 display_help();
1112
1113 if (!nr_counters) {
1114 if (run_perfstat)
1115 nr_counters = 8;
1116 else {
1117 nr_counters = 1;
1118 event_id[0] = 0;
1119 }
1120 }
1121
1122 for (counter = 0; counter < nr_counters; counter++) {
1123 if (event_count[counter])
1124 continue;
1125
1126 event_count[counter] = default_interval;
1127 }
1128 }
1129
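/*
 * One of these per mmap'ed counter: base points at the
 * struct perf_counter_mmap_page header page, immediately followed by the
 * data ring; mask is the ring size minus one (offsets wrap with "& mask")
 * and prev is the position mmap_read() had consumed up to last time.
 */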
1130 struct mmap_data {
1131 int counter;
1132 void *base;
1133 unsigned int mask;
1134 unsigned int prev;
1135 };
1136
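/*
 * Seqlock-style read of the mmap'ed header: an odd pc->lock value means
 * an update is in flight, and a pc->lock that changed while data_head was
 * read forces a retry; the rmb()s keep the reads ordered.
 */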
1137 static unsigned int mmap_read_head(struct mmap_data *md)
1138 {
1139 struct perf_counter_mmap_page *pc = md->base;
1140 unsigned int seq, head;
1141
1142 repeat:
1143 rmb();
1144 seq = pc->lock;
1145
1146 if (unlikely(seq & 1)) {
1147 cpu_relax();
1148 goto repeat;
1149 }
1150
1151 head = pc->data_head;
1152
1153 rmb();
1154 if (pc->lock != seq)
1155 goto repeat;
1156
1157 return head;
1158 }
1159
1160 static void mmap_read(struct mmap_data *md)
1161 {
1162 unsigned int head = mmap_read_head(md);
1163 unsigned int old = md->prev;
1164 unsigned char *data = md->base + page_size;
1165
1166 if (head - old > md->mask) {
1167 printf("ERROR: failed to keep up with mmap data\n");
1168 exit(-1);
1169 }
1170
1171 for (; old != head;) {
1172 __u64 *ptr = (__u64 *)&data[old & md->mask];
1173 old += sizeof(__u64);
1174
1175 process_event(*ptr, md->counter);
1176 }
1177
1178 md->prev = old;
1179 }
1180
1181 int main(int argc, char *argv[])
1182 {
1183 struct pollfd event_array[MAX_NR_CPUS][MAX_COUNTERS];
1184 struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
1185 struct perf_counter_hw_event hw_event;
1186 int i, counter, group_fd;
1187 unsigned int cpu;
1188 int ret;
1189
1190 page_size = sysconf(_SC_PAGE_SIZE);
1191
1192 process_options(argc, argv);
1193
1194 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
1195 assert(nr_cpus <= MAX_NR_CPUS);
1196 assert(nr_cpus >= 0);
1197
1198 if (run_perfstat)
1199 return do_perfstat(argc, argv);
1200
1201 if (tid != -1 || profile_cpu != -1)
1202 nr_cpus = 1;
1203
1204 for (i = 0; i < nr_cpus; i++) {
1205 group_fd = -1;
1206 for (counter = 0; counter < nr_counters; counter++) {
1207
1208 cpu = profile_cpu;
1209 if (tid == -1 && profile_cpu == -1)
1210 cpu = i;
1211
1212 memset(&hw_event, 0, sizeof(hw_event));
1213 hw_event.config = event_id[counter];
1214 hw_event.irq_period = event_count[counter];
1215 hw_event.record_type = PERF_RECORD_IRQ;
1216 hw_event.nmi = nmi;
1217
1218 fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
1219 if (fd[i][counter] < 0) {
1220 printf("kerneltop error: syscall returned with %d (%s)\n",
1221 fd[i][counter], strerror(-fd[i][counter]));
1222 if (fd[i][counter] == -1)
1223 printf("Are you root?\n");
1224 exit(-1);
1225 }
1226 fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
1227 assert(fd[i][counter] >= 0);
1228
1229 /*
1230 * First counter acts as the group leader:
1231 */
1232 if (group && group_fd == -1)
1233 group_fd = fd[i][counter];
1234
1235 event_array[i][counter].fd = fd[i][counter];
1236 event_array[i][counter].events = POLLIN;
1237
1238 mmap_array[i][counter].counter = counter;
1239 mmap_array[i][counter].prev = 0;
1240 mmap_array[i][counter].mask = 2*page_size - 1;
1241 mmap_array[i][counter].base = mmap(NULL, 3*page_size,
1242 PROT_READ, MAP_SHARED, fd[i][counter], 0);
1243 if (mmap_array[i][counter].base == MAP_FAILED) {
1244 printf("kerneltop error: failed to mmap with %d (%s)\n",
1245 errno, strerror(errno));
1246 exit(-1);
1247 }
1248 }
1249 }
1250
1251 parse_symbols();
1252 if (vmlinux && sym_filter_entry)
1253 parse_vmlinux(vmlinux);
1254
1255 printf("KernelTop refresh period: %d seconds\n", delay_secs);
1256 last_refresh = time(NULL);
1257
1258 while (1) {
1259 int hits = events;
1260
1261 for (i = 0; i < nr_cpus; i++) {
1262 for (counter = 0; counter < nr_counters; counter++)
1263 mmap_read(&mmap_array[i][counter]);
1264 }
1265
1266 if (time(NULL) >= last_refresh + delay_secs) {
1267 print_sym_table();
1268 events = userspace_events = 0;
1269 }
1270
1271 if (hits == events)
1272 ret = poll(event_array[0], nr_cpus, 1000);
1273 hits = events;
1274 }
1275
1276 return 0;
1277 }