Initial port to perf counters

Søren Sandmann Pedersen
2009-09-05 17:15:19 -04:00
parent a7a1ab3081
commit 594a58d674
5 changed files with 1290 additions and 199 deletions

33 barrier.h Normal file

@@ -0,0 +1,33 @@
#if defined(__i386__)
#define rmb() asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
#define cpu_relax() asm volatile("rep; nop" ::: "memory");
#endif
#if defined(__x86_64__)
#define rmb() asm volatile("lfence" ::: "memory")
#define cpu_relax() asm volatile("rep; nop" ::: "memory");
#endif
#ifdef __powerpc__
#define rmb() asm volatile ("sync" ::: "memory")
#define cpu_relax() asm volatile ("" ::: "memory");
#endif
#ifdef __s390__
#define rmb() asm volatile("bcr 15,0" ::: "memory")
#define cpu_relax() asm volatile("" ::: "memory");
#endif
#ifdef __sh__
#if defined(__SH4A__) || defined(__SH5__)
# define rmb() asm volatile("synco" ::: "memory")
#else
# define rmb() asm volatile("" ::: "memory")
#endif
#define cpu_relax() asm volatile("" ::: "memory")
#endif
#ifdef __hppa__
#define rmb() asm volatile("" ::: "memory")
#define cpu_relax() asm volatile("" ::: "memory");
#endif
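/* Editorial note, not part of the commit: these are the read-memory-barrier
 * and cpu_relax() helpers carried by the kernel's perf tooling.  Only rmb()
 * is used below (collector.c's on_read() issues it after reading data_head);
 * cpu_relax() is the usual building block for a polite busy-wait, as in this
 * illustrative sketch: */
static inline void
spin_until_nonzero (volatile int *flag)
{
    while (*flag == 0)
        cpu_relax ();
}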

collector.c

@@ -17,6 +17,17 @@
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include <stdint.h>
#include <glib.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <string.h>
#include <sys/mman.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include "stackstash.h"
#include "collector.h"
#include "module/sysprof-module.h"
@@ -24,17 +35,66 @@
#include "process.h"
#include "elfparser.h"
#include <errno.h>
#include <sys/wait.h>
#include <sys/types.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include "perf_counter.h"
#include "barrier.h"
#define SYSPROF_FILE "/dev/sysprof-trace"
#define N_PAGES 128 /* Number of pages in the ringbuffer */
static void set_no_module_error (GError **err);
static void set_cant_open_error (GError **err, int eno);
typedef struct counter_t counter_t;
typedef struct sample_event_t sample_event_t;
typedef struct mmap_event_t mmap_event_t;
typedef struct comm_event_t comm_event_t;
typedef union counter_event_t counter_event_t;
typedef void (* event_callback_t) (counter_event_t *event, gpointer data);
struct counter_t
{
int fd;
struct perf_counter_mmap_page * mmap_page;
uint8_t * data;
uint64_t tail;
int cpu;
event_callback_t callback;
gpointer user_data;
GString * partial;
};
struct sample_event_t
{
struct perf_event_header header;
uint64_t ip;
uint32_t pid, tid;
uint64_t n_ips;
uint64_t ips[1];
};
struct comm_event_t
{
struct perf_event_header header;
uint32_t pid, tid;
char comm[];
};
struct mmap_event_t
{
struct perf_event_header header;
uint32_t pid, tid;
uint64_t addr;
uint64_t pgoff;
char filename[1];
};
union counter_event_t
{
struct perf_event_header header;
mmap_event_t mmap;
comm_event_t comm;
sample_event_t sample;
};
struct Collector
{
@@ -42,23 +102,256 @@ struct Collector
gpointer data;
StackStash * stash;
int fd;
GTimeVal latest_reset;
int n_samples;
SysprofMmapArea * map_area;
unsigned int current;
GList * counters;
};
static int
get_n_cpus (void)
{
return sysconf (_SC_NPROCESSORS_ONLN);
}
static int
sysprof_perf_counter_open (struct perf_counter_attr *attr,
pid_t pid,
int cpu,
int group_fd,
unsigned long flags)
{
attr->size = sizeof(*attr);
return syscall (__NR_perf_counter_open, attr, pid, cpu, group_fd, flags);
}
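/* Editorial note, not part of the commit: there was no glibc wrapper for the
 * new syscall, hence the raw syscall() above.  __NR_perf_counter_open has to
 * come from the kernel headers; on a toolchain whose headers predate it, a
 * fallback along these lines is needed (the numbers are the 2.6.31-era x86
 * values and are given purely as an illustration): */
#ifndef __NR_perf_counter_open
# if defined(__x86_64__)
#  define __NR_perf_counter_open 298
# elif defined(__i386__)
#  define __NR_perf_counter_open 336
# endif
#endif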
static int
process_event (counter_t *counter, void *data)
{
struct perf_event_header *header = data;
counter->callback ((counter_event_t *)header, counter->user_data);
return header->size;
}
static int
n_missing_bytes (const uint8_t *data, int n_bytes)
{
const struct perf_event_header *header;
if (n_bytes < sizeof (*header))
return sizeof (*header) - n_bytes;
header = (const void *)data;
if (n_bytes < header->size)
return header->size - n_bytes;
return 0;
}
static void
process_events (counter_t *counter, uint8_t *data, int n_bytes,
GString *partial)
{
int n_events;
ssize_t n_missing;
n_events = 0;
while (n_bytes && (n_missing = n_missing_bytes (
(uint8_t *)partial->str, partial->len)))
{
n_missing = MIN (n_bytes, n_missing);
g_string_append_len (partial, (const char *)data, n_missing);
data += n_missing;
n_bytes -= n_missing;
}
if (partial->len)
{
int n_used;
if (n_missing_bytes ((uint8_t *)partial->str, partial->len))
return;
n_used = process_event (counter, partial->str);
g_assert (n_used == partial->len);
g_string_truncate (partial, 0);
}
while (n_bytes && !n_missing_bytes (data, n_bytes))
{
int n_used = process_event (counter, data);
data += n_used;
n_bytes -= n_used;
}
if (n_missing_bytes (data, n_bytes))
g_string_append_len (partial, (char *)data, n_bytes);
}
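/* Editorial note, not part of the commit: process_events() copes with a perf
 * event record that is split across two consecutive chunks, which happens
 * when on_read() below hands over the ring buffer in two pieces at the wrap
 * point.  Leftover bytes of the first chunk accumulate in "partial" and are
 * completed and dispatched on the next call, e.g. (illustrative sequence):
 *
 *     process_events (counter, end_of_ring,   n_tail, counter->partial);
 *     process_events (counter, start_of_ring, n_head, counter->partial);
 */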
static void
on_read (gpointer data)
{
uint64_t head, tail;
counter_t *counter = data;
int mask = (N_PAGES * process_get_page_size() - 1);
uint64_t size;
int diff;
tail = counter->tail;
head = counter->mmap_page->data_head;
rmb();
diff = head - tail;
if (diff < 0)
{
g_warning ("sysprof fails at reading the buffer\n");
tail = head;
}
size = head - tail;
if ((tail & mask) + size != (head & mask))
{
size = mask + 1 - (tail & mask);
g_assert ((tail & mask) + size <= (N_PAGES * process_get_page_size()));
process_events (counter, counter->data + (tail & mask),
size, counter->partial);
tail += size;
}
size = head - tail;
g_assert ((tail & mask) + size <= (N_PAGES * process_get_page_size()));
process_events (counter,
counter->data + (tail & mask), size, counter->partial);
tail += size;
counter->tail = tail;
counter->mmap_page->data_tail = tail;
}
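/* Editorial note, not part of the commit: worked example of the wrap handling
 * above.  With N_PAGES = 128 and 4096-byte pages the data area is 0x80000
 * bytes, so mask = 0x7ffff.  If tail = 0x7fff0 and head = 0x80010, the 0x20
 * pending bytes straddle the end of the buffer and are processed as 0x10
 * bytes at offset 0x7fff0 followed by 0x10 bytes at offset 0.  Writing
 * data_tail back afterwards tells the kernel the data has been consumed,
 * which is why the mapping is created PROT_READ | PROT_WRITE. */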
#define fail(x)
static counter_t *
counter_new (int cpu,
event_callback_t callback,
gpointer data)
{
struct perf_counter_attr attr;
counter_t *counter;
int fd;
counter = g_new (counter_t, 1);
memset (&attr, 0, sizeof (attr));
attr.type = PERF_TYPE_HARDWARE;
attr.config = PERF_COUNT_HW_CPU_CYCLES;
attr.sample_period = 1200000 ; /* In number of clock cycles -
* use frequency instead FIXME
*/
attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_CALLCHAIN;
attr.wakeup_events = 100000;
attr.disabled = TRUE;
attr.mmap = TRUE;
attr.comm = TRUE;
fd = sysprof_perf_counter_open (&attr, -1, cpu, -1, 0);
if (fd < 0)
{
fail ("perf_counter_open");
return NULL;
}
counter->fd = fd;
counter->mmap_page = mmap (
NULL, (N_PAGES + 1) * process_get_page_size(),
PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if (counter->mmap_page == MAP_FAILED)
{
fail ("mmap");
return NULL;
}
counter->data = (uint8_t *)counter->mmap_page + process_get_page_size ();
counter->tail = 0;
counter->cpu = cpu;
counter->partial = g_string_new (NULL);
counter->callback = callback;
counter->user_data = data;
fd_add_watch (fd, counter);
fd_set_read_callback (fd, on_read);
return counter;
}
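/* Editorial note, not part of the commit: layout of the (N_PAGES + 1)-page
 * mapping created above.
 *
 *   page 0              struct perf_counter_mmap_page (control/meta data)
 *   pages 1 .. N_PAGES  sample data ring buffer (the data area must span a
 *                       power-of-two number of pages, here 128)
 *
 * counter->data therefore starts one page past counter->mmap_page, and
 * collector_start() below opens one such counter per CPU (pid = -1,
 * cpu = i). */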
static void
counter_enable (counter_t *counter)
{
ioctl (counter->fd, PERF_COUNTER_IOC_ENABLE);
}
static void
counter_free (counter_t *counter)
{
munmap (counter->mmap_page, (N_PAGES + 1) * process_get_page_size());
fd_remove_watch (counter->fd);
close (counter->fd);
g_string_free (counter->partial, TRUE);
g_free (counter);
}
/*
* Collector
*/
void
collector_reset (Collector *collector)
{
if (collector->stash)
stack_stash_unref (collector->stash);
process_flush_caches();
collector->stash = stack_stash_new (NULL);
collector->n_samples = 0;
g_get_current_time (&collector->latest_reset);
}
/* callback is called whenever a new sample arrives */
Collector *
collector_new (CollectorFunc callback,
gpointer data)
{
Collector *collector = g_new0 (Collector, 1);
collector->callback = callback;
collector->data = data;
collector->fd = -1;
collector->stash = NULL;
collector_reset (collector);
@@ -88,9 +381,9 @@ static void
add_trace_to_stash (const SysprofStackTrace *trace,
StackStash *stash)
{
int i;
gulong *addrs;
Process *process = process_get_from_pid (trace->pid);
gulong *addrs;
int i;
int n_addresses;
int n_kernel_words;
int a;
@@ -165,200 +458,183 @@ in_dead_period (Collector *collector)
return FALSE;
}
static void
process_mmap (Collector *collector,
mmap_event_t *mmap)
{
}
static void
collect_traces (Collector *collector)
process_comm (Collector *collector,
comm_event_t *comm)
{
gboolean first;
/* After a reset we ignore samples for a short period so that
* a reset will actually cause 'samples' to become 0
*/
if (in_dead_period (collector))
}
static gboolean
is_context (uint64_t addr)
{
return
addr == PERF_CONTEXT_HV ||
addr == PERF_CONTEXT_KERNEL ||
addr == PERF_CONTEXT_USER ||
addr == PERF_CONTEXT_GUEST ||
addr == PERF_CONTEXT_GUEST_KERNEL ||
addr == PERF_CONTEXT_GUEST_USER;
}
static void
process_sample (Collector *collector,
sample_event_t *sample)
{
Process *process = process_get_from_pid (sample->pid);
gboolean first = collector->n_samples == 0;
uint64_t context = 0;
gulong addrs_stack[2048];
gulong *addrs;
int n_alloc;
int i;
gulong *a;
n_alloc = sample->n_ips + 2;
if (n_alloc < 2048)
addrs = addrs_stack;
else
addrs = g_new (gulong, n_alloc);
a = addrs;
for (i = 0; i < sample->n_ips; ++i)
{
collector->current = collector->map_area->head;
return;
uint64_t addr = sample->ips[i];
if (is_context (addr))
{
/* FIXME: think this through */
if (context == PERF_CONTEXT_KERNEL)
*a++ = 0x01; /* kernel marker */
context = addr;
}
else
{
if (context == PERF_CONTEXT_KERNEL)
{
if (process_is_kernel_address (addr))
*a++ = addr;
}
else
{
if (!context)
g_print ("no context\n");
process_ensure_map (process, sample->pid, addr);
*a++ = addr;
}
}
}
first = collector->n_samples == 0;
*a++ = (gulong)process;
while (collector->current != collector->map_area->head)
{
const SysprofStackTrace *trace;
stack_stash_add_trace (collector->stash, addrs, a - addrs, 1);
collector->n_samples++;
trace = &(collector->map_area->traces[collector->current]);
#if 0
{
int i;
g_print ("pid: %d (%d)\n", trace->pid, trace->n_addresses);
for (i=0; i < trace->n_addresses; ++i)
g_print ("rd: %08x\n", trace->addresses[i]);
g_print ("-=-\n");
}
#endif
#if 0
{
int i;
g_print ("pid: %d (%d)\n", trace->pid, trace->n_addresses);
for (i=0; i < trace->n_kernel_words; ++i)
g_print ("rd: %08x\n", trace->kernel_stack[i]);
g_print ("-=-\n");
}
#endif
add_trace_to_stash (trace, collector->stash);
collector->current++;
if (collector->current >= SYSPROF_N_TRACES)
collector->current = 0;
collector->n_samples++;
}
if (collector->callback)
collector->callback (first, collector->data);
if (addrs != addrs_stack)
g_free (addrs);
}
static void
on_read (gpointer data)
on_event (counter_event_t * event,
gpointer data)
{
Collector *collector = data;
char c;
/* Make sure poll() doesn't fire immediately again */
read (collector->fd, &c, 1);
collect_traces (collector);
}
static gboolean
load_module (void)
{
int exit_status = -1;
char *dummy1, *dummy2;
if (g_spawn_command_line_sync ("/sbin/modprobe sysprof-module",
&dummy1, &dummy2,
&exit_status,
NULL))
switch (event->header.type)
{
if (WIFEXITED (exit_status))
exit_status = WEXITSTATUS (exit_status);
case PERF_EVENT_MMAP:
process_mmap (collector, &event->mmap);
break;
g_free (dummy1);
g_free (dummy2);
}
return (exit_status == 0);
}
static gboolean
open_fd (Collector *collector,
GError **err)
{
int fd;
void *map_area;
fd = open (SYSPROF_FILE, O_RDONLY);
if (fd < 0)
{
if (load_module())
{
GTimer *timer = g_timer_new ();
while (fd < 0 && g_timer_elapsed (timer, NULL) < 0.5)
{
/* Wait for udev to discover the new device.
*/
usleep (100000);
errno = 0;
fd = open (SYSPROF_FILE, O_RDONLY);
}
g_timer_destroy (timer);
if (fd < 0)
{
set_cant_open_error (err, errno);
return FALSE;
}
}
case PERF_EVENT_LOST:
break;
if (fd < 0)
{
set_no_module_error (err);
return FALSE;
}
}
map_area = mmap (NULL, sizeof (SysprofMmapArea),
PROT_READ, MAP_SHARED, fd, 0);
if (map_area == MAP_FAILED)
{
close (fd);
set_cant_open_error (err, errno);
case PERF_EVENT_COMM:
process_comm (collector, &event->comm);
break;
return FALSE;
case PERF_EVENT_EXIT:
break;
case PERF_EVENT_THROTTLE:
break;
case PERF_EVENT_UNTHROTTLE:
break;
case PERF_EVENT_FORK:
break;
case PERF_EVENT_READ:
break;
case PERF_EVENT_SAMPLE:
process_sample (collector, &event->sample);
break;
default:
g_print ("unknown event: %d (%d)\n",
event->header.type, event->header.size);
break;
}
collector->map_area = map_area;
collector->current = 0;
collector->fd = fd;
fd_add_watch (collector->fd, collector);
return TRUE;
}
gboolean
collector_start (Collector *collector,
GError **err)
{
if (collector->fd < 0 && !open_fd (collector, err))
return FALSE;
int n_cpus = get_n_cpus ();
GList *list;
int i;
for (i = 0; i < n_cpus; ++i)
{
counter_t *counter = counter_new (i, on_event, collector);
collector->counters = g_list_append (collector->counters, counter);
}
/* Hack to make sure we parse the kernel symbols before
* starting collection, so the parsing doesn't interfere
* with the profiling.
*/
process_is_kernel_address (0);
for (list = collector->counters; list != NULL; list = list->next)
counter_enable (list->data);
fd_set_read_callback (collector->fd, on_read);
return TRUE;
}
void
collector_stop (Collector *collector)
{
if (collector->fd >= 0)
{
fd_remove_watch (collector->fd);
munmap (collector->map_area, sizeof (SysprofMmapArea));
collector->map_area = NULL;
collector->current = 0;
close (collector->fd);
collector->fd = -1;
}
}
GList *list;
void
collector_reset (Collector *collector)
{
if (collector->stash)
stack_stash_unref (collector->stash);
process_flush_caches();
collector->stash = stack_stash_new (NULL);
collector->n_samples = 0;
g_get_current_time (&collector->latest_reset);
for (list = collector->counters; list != NULL; list = list->next)
{
counter_t *counter = list->data;
counter_free (counter);
}
g_list_free (collector->counters);
collector->counters = NULL;
}
int
@@ -464,7 +740,7 @@ lookup_symbol (Process *process, gpointer address,
static void
resolve_symbols (GList *trace, gint size, gpointer data)
{
static const char *const everything = "Everything";
static const char *const everything = "[Everything]";
GList *list;
ResolveInfo *info = data;
Process *process = g_list_last (trace)->data;
@@ -538,31 +814,6 @@ collector_create_profile (Collector *collector)
return profile;
}
static void
set_no_module_error (GError **err)
{
g_set_error (err,
COLLECTOR_ERROR,
COLLECTOR_ERROR_CANT_OPEN_FILE,
"Can't open " SYSPROF_FILE ". You need to insert "
"the sysprof kernel module. Run\n"
"\n"
" modprobe sysprof-module\n"
"\n"
"as root");
}
static void
set_cant_open_error (GError **err,
int eno)
{
g_set_error (err,
COLLECTOR_ERROR,
COLLECTOR_ERROR_CANT_OPEN_FILE,
"Can't open " SYSPROF_FILE ": %s",
g_strerror (eno));
}
GQuark
collector_error_quark (void)
{

804 perf_counter.h Normal file

@@ -0,0 +1,804 @@
/*
* Performance counters:
*
* Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de>
* Copyright (C) 2008-2009, Red Hat, Inc., Ingo Molnar
* Copyright (C) 2008-2009, Red Hat, Inc., Peter Zijlstra
*
* Data type definitions, declarations, prototypes.
*
* Started by: Thomas Gleixner and Ingo Molnar
*
* For licencing details see kernel-base/COPYING
*/
#ifndef _LINUX_PERF_COUNTER_H
#define _LINUX_PERF_COUNTER_H
#include <linux/types.h>
#include <linux/ioctl.h>
#include <asm/byteorder.h>
/*
* User-space ABI bits:
*/
/*
* attr.type
*/
enum perf_type_id {
PERF_TYPE_HARDWARE = 0,
PERF_TYPE_SOFTWARE = 1,
PERF_TYPE_TRACEPOINT = 2,
PERF_TYPE_HW_CACHE = 3,
PERF_TYPE_RAW = 4,
PERF_TYPE_MAX, /* non-ABI */
};
/*
* Generalized performance counter event types, used by the
* attr.event_id parameter of the sys_perf_counter_open()
* syscall:
*/
enum perf_hw_id {
/*
* Common hardware events, generalized by the kernel:
*/
PERF_COUNT_HW_CPU_CYCLES = 0,
PERF_COUNT_HW_INSTRUCTIONS = 1,
PERF_COUNT_HW_CACHE_REFERENCES = 2,
PERF_COUNT_HW_CACHE_MISSES = 3,
PERF_COUNT_HW_BRANCH_INSTRUCTIONS = 4,
PERF_COUNT_HW_BRANCH_MISSES = 5,
PERF_COUNT_HW_BUS_CYCLES = 6,
PERF_COUNT_HW_MAX, /* non-ABI */
};
/*
* Generalized hardware cache counters:
*
* { L1-D, L1-I, LLC, ITLB, DTLB, BPU } x
* { read, write, prefetch } x
* { accesses, misses }
*/
enum perf_hw_cache_id {
PERF_COUNT_HW_CACHE_L1D = 0,
PERF_COUNT_HW_CACHE_L1I = 1,
PERF_COUNT_HW_CACHE_LL = 2,
PERF_COUNT_HW_CACHE_DTLB = 3,
PERF_COUNT_HW_CACHE_ITLB = 4,
PERF_COUNT_HW_CACHE_BPU = 5,
PERF_COUNT_HW_CACHE_MAX, /* non-ABI */
};
enum perf_hw_cache_op_id {
PERF_COUNT_HW_CACHE_OP_READ = 0,
PERF_COUNT_HW_CACHE_OP_WRITE = 1,
PERF_COUNT_HW_CACHE_OP_PREFETCH = 2,
PERF_COUNT_HW_CACHE_OP_MAX, /* non-ABI */
};
enum perf_hw_cache_op_result_id {
PERF_COUNT_HW_CACHE_RESULT_ACCESS = 0,
PERF_COUNT_HW_CACHE_RESULT_MISS = 1,
PERF_COUNT_HW_CACHE_RESULT_MAX, /* non-ABI */
};
/*
* Special "software" counters provided by the kernel, even if the hardware
* does not support performance counters. These counters measure various
* physical and sw events of the kernel (and allow the profiling of them as
* well):
*/
enum perf_sw_ids {
PERF_COUNT_SW_CPU_CLOCK = 0,
PERF_COUNT_SW_TASK_CLOCK = 1,
PERF_COUNT_SW_PAGE_FAULTS = 2,
PERF_COUNT_SW_CONTEXT_SWITCHES = 3,
PERF_COUNT_SW_CPU_MIGRATIONS = 4,
PERF_COUNT_SW_PAGE_FAULTS_MIN = 5,
PERF_COUNT_SW_PAGE_FAULTS_MAJ = 6,
PERF_COUNT_SW_MAX, /* non-ABI */
};
/*
* Bits that can be set in attr.sample_type to request information
* in the overflow packets.
*/
enum perf_counter_sample_format {
PERF_SAMPLE_IP = 1U << 0,
PERF_SAMPLE_TID = 1U << 1,
PERF_SAMPLE_TIME = 1U << 2,
PERF_SAMPLE_ADDR = 1U << 3,
PERF_SAMPLE_READ = 1U << 4,
PERF_SAMPLE_CALLCHAIN = 1U << 5,
PERF_SAMPLE_ID = 1U << 6,
PERF_SAMPLE_CPU = 1U << 7,
PERF_SAMPLE_PERIOD = 1U << 8,
PERF_SAMPLE_STREAM_ID = 1U << 9,
PERF_SAMPLE_RAW = 1U << 10,
PERF_SAMPLE_MAX = 1U << 11, /* non-ABI */
};
/*
* The format of the data returned by read() on a perf counter fd,
* as specified by attr.read_format:
*
* struct read_format {
* { u64 value;
* { u64 time_enabled; } && PERF_FORMAT_ENABLED
* { u64 time_running; } && PERF_FORMAT_RUNNING
* { u64 id; } && PERF_FORMAT_ID
* } && !PERF_FORMAT_GROUP
*
* { u64 nr;
* { u64 time_enabled; } && PERF_FORMAT_ENABLED
* { u64 time_running; } && PERF_FORMAT_RUNNING
* { u64 value;
* { u64 id; } && PERF_FORMAT_ID
* } cntr[nr];
* } && PERF_FORMAT_GROUP
* };
*/
enum perf_counter_read_format {
PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0,
PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1,
PERF_FORMAT_ID = 1U << 2,
PERF_FORMAT_GROUP = 1U << 3,
PERF_FORMAT_MAX = 1U << 4, /* non-ABI */
};
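/* Editorial note, not part of the header: with read_format left at 0 (the
 * collector above zeroes the whole attr, so this is its case), read() on the
 * counter fd returns a single u64, e.g. (illustrative):
 *
 *     uint64_t value;
 *     if (read (fd, &value, sizeof value) != sizeof value)
 *         return -1;
 */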
#define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */
/*
* Hardware event to monitor via a performance monitoring counter:
*/
struct perf_counter_attr {
/*
* Major type: hardware/software/tracepoint/etc.
*/
__u32 type;
/*
* Size of the attr structure, for fwd/bwd compat.
*/
__u32 size;
/*
* Type specific configuration information.
*/
__u64 config;
union {
__u64 sample_period;
__u64 sample_freq;
};
__u64 sample_type;
__u64 read_format;
__u64 disabled : 1, /* off by default */
inherit : 1, /* children inherit it */
pinned : 1, /* must always be on PMU */
exclusive : 1, /* only group on PMU */
exclude_user : 1, /* don't count user */
exclude_kernel : 1, /* ditto kernel */
exclude_hv : 1, /* ditto hypervisor */
exclude_idle : 1, /* don't count when idle */
mmap : 1, /* include mmap data */
comm : 1, /* include comm data */
freq : 1, /* use freq, not period */
inherit_stat : 1, /* per task counts */
enable_on_exec : 1, /* next exec enables */
task : 1, /* trace fork/exit */
__reserved_1 : 50;
__u32 wakeup_events; /* wakeup every n events */
__u32 __reserved_2;
__u64 __reserved_3;
};
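/* Editorial note, not part of the header: the sample_period/sample_freq
 * union above is selected by the "freq" bit.  The FIXME in collector.c's
 * counter_new() refers to switching from a fixed cycle period to this
 * frequency-based mode; an illustrative setup (values are assumptions):
 *
 *     attr.freq        = 1;      // interpret the union as sample_freq
 *     attr.sample_freq = 1000;   // aim for roughly 1000 samples/s per CPU
 */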
/*
* Ioctls that can be done on a perf counter fd:
*/
#define PERF_COUNTER_IOC_ENABLE _IO ('$', 0)
#define PERF_COUNTER_IOC_DISABLE _IO ('$', 1)
#define PERF_COUNTER_IOC_REFRESH _IO ('$', 2)
#define PERF_COUNTER_IOC_RESET _IO ('$', 3)
#define PERF_COUNTER_IOC_PERIOD _IOW('$', 4, u64)
enum perf_counter_ioc_flags {
PERF_IOC_FLAG_GROUP = 1U << 0,
};
/*
* Structure of the page that can be mapped via mmap
*/
struct perf_counter_mmap_page {
__u32 version; /* version number of this structure */
__u32 compat_version; /* lowest version this is compat with */
/*
* Bits needed to read the hw counters in user-space.
*
* u32 seq;
* s64 count;
*
* do {
* seq = pc->lock;
*
* barrier()
* if (pc->index) {
* count = pmc_read(pc->index - 1);
* count += pc->offset;
* } else
* goto regular_read;
*
* barrier();
* } while (pc->lock != seq);
*
* NOTE: for obvious reason this only works on self-monitoring
* processes.
*/
__u32 lock; /* seqlock for synchronization */
__u32 index; /* hardware counter identifier */
__s64 offset; /* add to hardware counter value */
__u64 time_enabled; /* time counter active */
__u64 time_running; /* time counter on cpu */
/*
* Hole for extension of the self monitor capabilities
*/
__u64 __reserved[123]; /* align to 1k */
/*
* Control data for the mmap() data buffer.
*
* User-space reading the @data_head value should issue an rmb(), on
* SMP capable platforms, after reading this value -- see
* perf_counter_wakeup().
*
* When the mapping is PROT_WRITE the @data_tail value should be
* written by userspace to reflect the last read data. In this case
* the kernel will not over-write unread data.
*/
__u64 data_head; /* head in the data section */
__u64 data_tail; /* user-space written tail */
};
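/* Editorial note, not part of the header: consumer side of the
 * data_head/data_tail protocol described above, as implemented by
 * collector.c's on_read() (sketch only):
 *
 *     head = pc->data_head;
 *     rmb ();                          // see barrier.h
 *     ... process bytes [data_tail, head) of the data area ...
 *     pc->data_tail = head;            // only valid on a PROT_WRITE mapping
 */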
#define PERF_EVENT_MISC_CPUMODE_MASK (3 << 0)
#define PERF_EVENT_MISC_CPUMODE_UNKNOWN (0 << 0)
#define PERF_EVENT_MISC_KERNEL (1 << 0)
#define PERF_EVENT_MISC_USER (2 << 0)
#define PERF_EVENT_MISC_HYPERVISOR (3 << 0)
struct perf_event_header {
__u32 type;
__u16 misc;
__u16 size;
};
enum perf_event_type {
/*
* The MMAP events record the PROT_EXEC mappings so that we can
* correlate userspace IPs to code. They have the following structure:
*
* struct {
* struct perf_event_header header;
*
* u32 pid, tid;
* u64 addr;
* u64 len;
* u64 pgoff;
* char filename[];
* };
*/
PERF_EVENT_MMAP = 1,
/*
* struct {
* struct perf_event_header header;
* u64 id;
* u64 lost;
* };
*/
PERF_EVENT_LOST = 2,
/*
* struct {
* struct perf_event_header header;
*
* u32 pid, tid;
* char comm[];
* };
*/
PERF_EVENT_COMM = 3,
/*
* struct {
* struct perf_event_header header;
* u32 pid, ppid;
* u32 tid, ptid;
* };
*/
PERF_EVENT_EXIT = 4,
/*
* struct {
* struct perf_event_header header;
* u64 time;
* u64 id;
* u64 stream_id;
* };
*/
PERF_EVENT_THROTTLE = 5,
PERF_EVENT_UNTHROTTLE = 6,
/*
* struct {
* struct perf_event_header header;
* u32 pid, ppid;
* u32 tid, ptid;
* };
*/
PERF_EVENT_FORK = 7,
/*
* struct {
* struct perf_event_header header;
* u32 pid, tid;
*
* struct read_format values;
* };
*/
PERF_EVENT_READ = 8,
/*
* struct {
* struct perf_event_header header;
*
* { u64 ip; } && PERF_SAMPLE_IP
* { u32 pid, tid; } && PERF_SAMPLE_TID
* { u64 time; } && PERF_SAMPLE_TIME
* { u64 addr; } && PERF_SAMPLE_ADDR
* { u64 id; } && PERF_SAMPLE_ID
* { u64 stream_id;} && PERF_SAMPLE_STREAM_ID
* { u32 cpu, res; } && PERF_SAMPLE_CPU
* { u64 period; } && PERF_SAMPLE_PERIOD
*
* { struct read_format values; } && PERF_SAMPLE_READ
*
* { u64 nr,
* u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN
*
* #
* # The RAW record below is opaque data wrt the ABI
* #
* # That is, the ABI doesn't make any promises wrt to
* # the stability of its content, it may vary depending
* # on event, hardware, kernel version and phase of
* # the moon.
* #
* # In other words, PERF_SAMPLE_RAW contents are not an ABI.
* #
*
* { u32 size;
* char data[size];}&& PERF_SAMPLE_RAW
* };
*/
PERF_EVENT_SAMPLE = 9,
PERF_EVENT_MAX, /* non-ABI */
};
enum perf_callchain_context {
PERF_CONTEXT_HV = (__u64)-32,
PERF_CONTEXT_KERNEL = (__u64)-128,
PERF_CONTEXT_USER = (__u64)-512,
PERF_CONTEXT_GUEST = (__u64)-2048,
PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176,
PERF_CONTEXT_GUEST_USER = (__u64)-2560,
PERF_CONTEXT_MAX = (__u64)-4095,
};
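/* Editorial note, not part of the header: these sentinel values are mixed
 * into the ips[] array of a PERF_EVENT_SAMPLE callchain to mark a change of
 * context; every address that follows belongs to that context until the
 * next sentinel, e.g. (illustrative):
 *
 *     ips[] = { PERF_CONTEXT_KERNEL, k_ip0, k_ip1,
 *               PERF_CONTEXT_USER,   u_ip0, u_ip1, ... }
 *
 * collector.c's is_context()/process_sample() rely on this layout. */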
#ifdef __KERNEL__
/*
* Kernel-internal data types and definitions:
*/
#ifdef CONFIG_PERF_COUNTERS
# include <asm/perf_counter.h>
#endif
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/hrtimer.h>
#include <linux/fs.h>
#include <linux/pid_namespace.h>
#include <asm/atomic.h>
#define PERF_MAX_STACK_DEPTH 255
struct perf_callchain_entry {
__u64 nr;
__u64 ip[PERF_MAX_STACK_DEPTH];
};
struct perf_raw_record {
u32 size;
void *data;
};
struct task_struct;
/**
* struct hw_perf_counter - performance counter hardware details:
*/
struct hw_perf_counter {
#ifdef CONFIG_PERF_COUNTERS
union {
struct { /* hardware */
u64 config;
unsigned long config_base;
unsigned long counter_base;
int idx;
};
union { /* software */
atomic64_t count;
struct hrtimer hrtimer;
};
};
atomic64_t prev_count;
u64 sample_period;
u64 last_period;
atomic64_t period_left;
u64 interrupts;
u64 freq_count;
u64 freq_interrupts;
u64 freq_stamp;
#endif
};
struct perf_counter;
/**
* struct pmu - generic performance monitoring unit
*/
struct pmu {
int (*enable) (struct perf_counter *counter);
void (*disable) (struct perf_counter *counter);
void (*read) (struct perf_counter *counter);
void (*unthrottle) (struct perf_counter *counter);
};
/**
* enum perf_counter_active_state - the states of a counter
*/
enum perf_counter_active_state {
PERF_COUNTER_STATE_ERROR = -2,
PERF_COUNTER_STATE_OFF = -1,
PERF_COUNTER_STATE_INACTIVE = 0,
PERF_COUNTER_STATE_ACTIVE = 1,
};
struct file;
struct perf_mmap_data {
struct rcu_head rcu_head;
int nr_pages; /* nr of data pages */
int writable; /* are we writable */
int nr_locked; /* nr pages mlocked */
atomic_t poll; /* POLL_ for wakeups */
atomic_t events; /* event limit */
atomic_long_t head; /* write position */
atomic_long_t done_head; /* completed head */
atomic_t lock; /* concurrent writes */
atomic_t wakeup; /* needs a wakeup */
atomic_t lost; /* nr records lost */
struct perf_counter_mmap_page *user_page;
void *data_pages[0];
};
struct perf_pending_entry {
struct perf_pending_entry *next;
void (*func)(struct perf_pending_entry *);
};
/**
* struct perf_counter - performance counter kernel representation:
*/
struct perf_counter {
#ifdef CONFIG_PERF_COUNTERS
struct list_head list_entry;
struct list_head event_entry;
struct list_head sibling_list;
int nr_siblings;
struct perf_counter *group_leader;
const struct pmu *pmu;
enum perf_counter_active_state state;
atomic64_t count;
/*
* These are the total time in nanoseconds that the counter
* has been enabled (i.e. eligible to run, and the task has
* been scheduled in, if this is a per-task counter)
* and running (scheduled onto the CPU), respectively.
*
* They are computed from tstamp_enabled, tstamp_running and
* tstamp_stopped when the counter is in INACTIVE or ACTIVE state.
*/
u64 total_time_enabled;
u64 total_time_running;
/*
* These are timestamps used for computing total_time_enabled
* and total_time_running when the counter is in INACTIVE or
* ACTIVE state, measured in nanoseconds from an arbitrary point
* in time.
* tstamp_enabled: the notional time when the counter was enabled
* tstamp_running: the notional time when the counter was scheduled on
* tstamp_stopped: in INACTIVE state, the notional time when the
* counter was scheduled off.
*/
u64 tstamp_enabled;
u64 tstamp_running;
u64 tstamp_stopped;
struct perf_counter_attr attr;
struct hw_perf_counter hw;
struct perf_counter_context *ctx;
struct file *filp;
/*
* These accumulate total time (in nanoseconds) that children
* counters have been enabled and running, respectively.
*/
atomic64_t child_total_time_enabled;
atomic64_t child_total_time_running;
/*
* Protect attach/detach and child_list:
*/
struct mutex child_mutex;
struct list_head child_list;
struct perf_counter *parent;
int oncpu;
int cpu;
struct list_head owner_entry;
struct task_struct *owner;
/* mmap bits */
struct mutex mmap_mutex;
atomic_t mmap_count;
struct perf_mmap_data *data;
/* poll related */
wait_queue_head_t waitq;
struct fasync_struct *fasync;
/* delayed work for NMIs and such */
int pending_wakeup;
int pending_kill;
int pending_disable;
struct perf_pending_entry pending;
atomic_t event_limit;
void (*destroy)(struct perf_counter *);
struct rcu_head rcu_head;
struct pid_namespace *ns;
u64 id;
#endif
};
/**
* struct perf_counter_context - counter context structure
*
* Used as a container for task counters and CPU counters as well:
*/
struct perf_counter_context {
/*
* Protect the states of the counters in the list,
* nr_active, and the list:
*/
spinlock_t lock;
/*
* Protect the list of counters. Locking either mutex or lock
* is sufficient to ensure the list doesn't change; to change
* the list you need to lock both the mutex and the spinlock.
*/
struct mutex mutex;
struct list_head counter_list;
struct list_head event_list;
int nr_counters;
int nr_active;
int is_active;
int nr_stat;
atomic_t refcount;
struct task_struct *task;
/*
* Context clock, runs when context enabled.
*/
u64 time;
u64 timestamp;
/*
* These fields let us detect when two contexts have both
* been cloned (inherited) from a common ancestor.
*/
struct perf_counter_context *parent_ctx;
u64 parent_gen;
u64 generation;
int pin_count;
struct rcu_head rcu_head;
};
/**
* struct perf_counter_cpu_context - per cpu counter context structure
*/
struct perf_cpu_context {
struct perf_counter_context ctx;
struct perf_counter_context *task_ctx;
int active_oncpu;
int max_pertask;
int exclusive;
/*
* Recursion avoidance:
*
* task, softirq, irq, nmi context
*/
int recursion[4];
};
#ifdef CONFIG_PERF_COUNTERS
/*
* Set by architecture code:
*/
extern int perf_max_counters;
extern const struct pmu *hw_perf_counter_init(struct perf_counter *counter);
extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
extern void perf_counter_task_sched_out(struct task_struct *task,
struct task_struct *next, int cpu);
extern void perf_counter_task_tick(struct task_struct *task, int cpu);
extern int perf_counter_init_task(struct task_struct *child);
extern void perf_counter_exit_task(struct task_struct *child);
extern void perf_counter_free_task(struct task_struct *task);
extern void set_perf_counter_pending(void);
extern void perf_counter_do_pending(void);
extern void perf_counter_print_debug(void);
extern void __perf_disable(void);
extern bool __perf_enable(void);
extern void perf_disable(void);
extern void perf_enable(void);
extern int perf_counter_task_disable(void);
extern int perf_counter_task_enable(void);
extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
struct perf_cpu_context *cpuctx,
struct perf_counter_context *ctx, int cpu);
extern void perf_counter_update_userpage(struct perf_counter *counter);
struct perf_sample_data {
struct pt_regs *regs;
u64 addr;
u64 period;
struct perf_raw_record *raw;
};
extern int perf_counter_overflow(struct perf_counter *counter, int nmi,
struct perf_sample_data *data);
extern void perf_counter_output(struct perf_counter *counter, int nmi,
struct perf_sample_data *data);
/*
* Return 1 for a software counter, 0 for a hardware counter
*/
static inline int is_software_counter(struct perf_counter *counter)
{
return (counter->attr.type != PERF_TYPE_RAW) &&
(counter->attr.type != PERF_TYPE_HARDWARE) &&
(counter->attr.type != PERF_TYPE_HW_CACHE);
}
extern atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX];
extern void __perf_swcounter_event(u32, u64, int, struct pt_regs *, u64);
static inline void
perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
{
if (atomic_read(&perf_swcounter_enabled[event]))
__perf_swcounter_event(event, nr, nmi, regs, addr);
}
extern void __perf_counter_mmap(struct vm_area_struct *vma);
static inline void perf_counter_mmap(struct vm_area_struct *vma)
{
if (vma->vm_flags & VM_EXEC)
__perf_counter_mmap(vma);
}
extern void perf_counter_comm(struct task_struct *tsk);
extern void perf_counter_fork(struct task_struct *tsk);
extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
extern int sysctl_perf_counter_paranoid;
extern int sysctl_perf_counter_mlock;
extern int sysctl_perf_counter_sample_rate;
extern void perf_counter_init(void);
#ifndef perf_misc_flags
#define perf_misc_flags(regs) (user_mode(regs) ? PERF_EVENT_MISC_USER : \
PERF_EVENT_MISC_KERNEL)
#define perf_instruction_pointer(regs) instruction_pointer(regs)
#endif
#else
static inline void
perf_counter_task_sched_in(struct task_struct *task, int cpu) { }
static inline void
perf_counter_task_sched_out(struct task_struct *task,
struct task_struct *next, int cpu) { }
static inline void
perf_counter_task_tick(struct task_struct *task, int cpu) { }
static inline int perf_counter_init_task(struct task_struct *child) { return 0; }
static inline void perf_counter_exit_task(struct task_struct *child) { }
static inline void perf_counter_free_task(struct task_struct *task) { }
static inline void perf_counter_do_pending(void) { }
static inline void perf_counter_print_debug(void) { }
static inline void perf_disable(void) { }
static inline void perf_enable(void) { }
static inline int perf_counter_task_disable(void) { return -EINVAL; }
static inline int perf_counter_task_enable(void) { return -EINVAL; }
static inline void
perf_swcounter_event(u32 event, u64 nr, int nmi,
struct pt_regs *regs, u64 addr) { }
static inline void perf_counter_mmap(struct vm_area_struct *vma) { }
static inline void perf_counter_comm(struct task_struct *tsk) { }
static inline void perf_counter_fork(struct task_struct *tsk) { }
static inline void perf_counter_init(void) { }
#endif
#endif /* __KERNEL__ */
#endif /* _LINUX_PERF_COUNTER_H */

process.c

@@ -296,8 +296,8 @@ process_has_page (Process *process, gulong addr)
return FALSE;
}
static int
page_size (void)
int
process_get_page_size (void)
{
static int page_size;
static gboolean has_page_size = FALSE;
@@ -316,7 +316,7 @@ process_ensure_map (Process *process, int pid, gulong addr)
{
/* Round down to closest page */
addr = (addr - addr % page_size());
addr = (addr - addr % process_get_page_size());
if (process_has_page (process, addr))
return;
@@ -678,7 +678,7 @@ process_lookup_kernel_symbol (gulong address,
const char *
process_lookup_symbol (Process *process, gulong address, gulong *offset)
{
static const char *const kernel = "kernel";
static const char *const kernel = "[kernel]";
const BinSymbol *result;
Map *map = process_locate_map (process, address);
@@ -756,7 +756,9 @@ process_lookup_symbol (Process *process, gulong address, gulong *offset)
g_print (" ---> %s\n", result->name);
#endif
/* g_print ("(%x) %x %x name; %s\n", address, map->start, map->offset, result->name); */
/* g_print ("(%x) %x %x name; %s\n", address, map->start,
* map->offset, result->name);
*/
#if 0
g_print ("name: %s (in %s)\n", bin_symbol_get_name (map->bin_file, result), map->filename);

process.h

@@ -60,5 +60,6 @@ const guint8 *process_get_vdso_bytes (gsize *length);
gboolean process_is_kernel_address (gulong address);
const char * process_lookup_kernel_symbol (gulong address,
gulong *offset);
int process_get_page_size (void);
#endif