Add enhanced vfsreadlat BCC example with live plotly and dash graphs on browser

This commit is contained in:
Pragyansh Chaturvedi
2025-10-21 05:36:59 +05:30
parent e98d5684ea
commit 798f07986a
3 changed files with 479 additions and 0 deletions

View File

@ -0,0 +1,101 @@
"""BPF program for tracing VFS read latency."""
from pythonbpf import bpf, map, struct, section, bpfglobal, BPF
from pythonbpf.helper import ktime, pid
from pythonbpf.maps import HashMap, PerfEventArray
from ctypes import c_void_p, c_uint64
import argparse
from data_collector import LatencyCollector
from dashboard import LatencyDashboard
@bpf
@struct
class latency_event:
pid: c_uint64
delta_us: c_uint64
@bpf
@map
def start() -> HashMap:
"""Map to store start timestamps by PID."""
return HashMap(key=c_uint64, value=c_uint64, max_entries=10240)
@bpf
@map
def events() -> PerfEventArray:
"""Perf event array for sending latency events to userspace."""
return PerfEventArray(key_size=c_uint64, value_size=c_uint64)
@bpf
@section("kprobe/vfs_read")
def do_entry(ctx: c_void_p) -> c_uint64:
"""Record start time when vfs_read is called."""
p, ts = pid(), ktime()
start.update(p, ts)
return 0 # type: ignore [return-value]
@bpf
@section("kretprobe/vfs_read")
def do_return(ctx: c_void_p) -> c_uint64:
"""Calculate and record latency when vfs_read returns."""
p = pid()
tsp = start.lookup(p)
if tsp:
delta_ns = ktime() - tsp
# Only track latencies > 1 microsecond
if delta_ns > 1000:
evt = latency_event()
evt.pid, evt.delta_us = p, delta_ns // 1000
events.output(evt)
start.delete(p)
return 0 # type: ignore [return-value]
@bpf
@bpfglobal
def LICENSE() -> str:
return "GPL"
def parse_args():
"""Parse command line arguments."""
parser = argparse.ArgumentParser(
description="Monitor VFS read latency with live dashboard"
)
parser.add_argument(
"--host", default="0.0.0.0", help="Dashboard host (default: 0.0.0.0)"
)
parser.add_argument(
"--port", type=int, default=8050, help="Dashboard port (default: 8050)"
)
parser.add_argument(
"--buffer", type=int, default=10000, help="Recent data buffer size"
)
return parser.parse_args()
args = parse_args()
# Load BPF program
print("Loading BPF program...")
b = BPF()
b.load()
b.attach_all()
print("✅ BPF program loaded and attached")
# Setup data collector
collector = LatencyCollector(b, buffer_size=args.buffer)
collector.start()
# Create and run dashboard
dashboard = LatencyDashboard(collector)
dashboard.run(host=args.host, port=args.port)

View File

@ -0,0 +1,282 @@
"""Plotly Dash dashboard for visualizing latency data."""
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
class LatencyDashboard:
"""Interactive dashboard for latency visualization."""
def __init__(self, collector, title: str = "VFS Read Latency Monitor"):
self.collector = collector
self.app = dash.Dash(__name__)
self.app.title = title
self._setup_layout()
self._setup_callbacks()
def _setup_layout(self):
"""Create dashboard layout."""
self.app.layout = html.Div(
[
html.H1(
"🔥 VFS Read Latency Dashboard",
style={
"textAlign": "center",
"color": "#2c3e50",
"marginBottom": 20,
},
),
# Stats cards
html.Div(
[
self._create_stat_card(
"total-samples", "📊 Total Samples", "#3498db"
),
self._create_stat_card(
"mean-latency", "⚡ Mean Latency", "#e74c3c"
),
self._create_stat_card(
"p99-latency", "🔥 P99 Latency", "#f39c12"
),
],
style={
"display": "flex",
"justifyContent": "space-around",
"marginBottom": 30,
},
),
# Graphs - ✅ Make sure these IDs match the callback outputs
dcc.Graph(id="dual-histogram", style={"height": "450px"}),
dcc.Graph(id="log2-buckets", style={"height": "350px"}),
dcc.Graph(id="timeseries-graph", style={"height": "300px"}),
# Auto-update
dcc.Interval(id="interval-component", interval=1000, n_intervals=0),
],
style={"padding": 20, "fontFamily": "Arial, sans-serif"},
)
def _create_stat_card(self, id_name: str, title: str, color: str):
"""Create a statistics card."""
return html.Div(
[
html.H3(title, style={"color": color}),
html.H2(id=id_name, style={"fontSize": 48, "color": "#2c3e50"}),
],
className="stat-box",
style={
"background": "white",
"padding": 20,
"borderRadius": 10,
"boxShadow": "0 4px 6px rgba(0,0,0,0.1)",
"textAlign": "center",
"flex": 1,
"margin": "0 10px",
},
)
def _setup_callbacks(self):
"""Setup dashboard callbacks."""
@self.app.callback(
[
Output("total-samples", "children"),
Output("mean-latency", "children"),
Output("p99-latency", "children"),
Output("dual-histogram", "figure"), # ✅ Match layout IDs
Output("log2-buckets", "figure"), # ✅ Match layout IDs
Output("timeseries-graph", "figure"), # ✅ Match layout IDs
],
[Input("interval-component", "n_intervals")],
)
def update_dashboard(n):
stats = self.collector.get_stats()
if stats.total == 0:
return self._empty_state()
return (
f"{stats.total:,}",
f"{stats.mean:.1f} µs",
f"{stats.p99:.1f} µs",
self._create_dual_histogram(),
self._create_log2_buckets(),
self._create_timeseries(),
)
def _empty_state(self):
"""Return empty state for dashboard."""
empty_fig = go.Figure()
empty_fig.update_layout(
title="Waiting for data... Generate some disk I/O!", template="plotly_white"
)
# ✅ Return 6 values (3 stats + 3 figures)
return "0", "0 µs", "0 µs", empty_fig, empty_fig, empty_fig
def _create_dual_histogram(self) -> go.Figure:
"""Create side-by-side linear and log2 histograms."""
latencies = self.collector.get_all_latencies()
# Create subplots
fig = make_subplots(
rows=1,
cols=2,
subplot_titles=("Linear Scale", "Log2 Scale"),
horizontal_spacing=0.12,
)
# Linear histogram
fig.add_trace(
go.Histogram(
x=latencies,
nbinsx=50,
marker_color="rgb(55, 83, 109)",
opacity=0.75,
name="Linear",
),
row=1,
col=1,
)
# Log2 histogram
log2_latencies = np.log2(latencies + 1) # +1 to avoid log2(0)
fig.add_trace(
go.Histogram(
x=log2_latencies,
nbinsx=30,
marker_color="rgb(243, 156, 18)",
opacity=0.75,
name="Log2",
),
row=1,
col=2,
)
# Update axes
fig.update_xaxes(title_text="Latency (µs)", row=1, col=1)
fig.update_xaxes(title_text="log2(Latency in µs)", row=1, col=2)
fig.update_yaxes(title_text="Count", row=1, col=1)
fig.update_yaxes(title_text="Count", row=1, col=2)
fig.update_layout(
title_text="📊 Latency Distribution (Linear vs Log2)",
template="plotly_white",
showlegend=False,
height=450,
)
return fig
def _create_log2_buckets(self) -> go.Figure:
"""Create bar chart of log2 buckets (like BCC histogram)."""
buckets = self.collector.get_histogram_buckets()
if not buckets:
fig = go.Figure()
fig.update_layout(
title="🔥 Log2 Histogram - Waiting for data...", template="plotly_white"
)
return fig
# Sort buckets
sorted_buckets = sorted(buckets.keys())
counts = [buckets[b] for b in sorted_buckets]
# Create labels (e.g., "8-16µs", "16-32µs")
labels = []
hover_text = []
for bucket in sorted_buckets:
lower = 2**bucket
upper = 2 ** (bucket + 1)
labels.append(f"{lower}-{upper}")
# Calculate percentage
total = sum(counts)
pct = (buckets[bucket] / total) * 100 if total > 0 else 0
hover_text.append(
f"Range: {lower}-{upper} µs<br>"
f"Count: {buckets[bucket]:,}<br>"
f"Percentage: {pct:.2f}%"
)
# Create bar chart
fig = go.Figure()
fig.add_trace(
go.Bar(
x=labels,
y=counts,
marker=dict(
color=counts,
colorscale="YlOrRd",
showscale=True,
colorbar=dict(title="Count"),
),
text=counts,
textposition="outside",
hovertext=hover_text,
hoverinfo="text",
)
)
fig.update_layout(
title="🔥 Log2 Histogram (BCC-style buckets)",
xaxis_title="Latency Range (µs)",
yaxis_title="Count",
template="plotly_white",
height=350,
xaxis=dict(tickangle=-45),
)
return fig
def _create_timeseries(self) -> go.Figure:
"""Create time series figure."""
recent = self.collector.get_recent_latencies()
if not recent:
fig = go.Figure()
fig.update_layout(
title="⏱️ Real-time Latency - Waiting for data...",
template="plotly_white",
)
return fig
times = [d["time"] for d in recent]
lats = [d["latency"] for d in recent]
fig = go.Figure()
fig.add_trace(
go.Scatter(
x=times,
y=lats,
mode="lines",
line=dict(color="rgb(231, 76, 60)", width=2),
fill="tozeroy",
fillcolor="rgba(231, 76, 60, 0.2)",
)
)
fig.update_layout(
title="⏱️ Real-time Latency (Last 10,000 samples)",
xaxis_title="Time (seconds)",
yaxis_title="Latency (µs)",
template="plotly_white",
height=300,
)
return fig
def run(self, host: str = "0.0.0.0", port: int = 8050, debug: bool = False):
"""Run the dashboard server."""
print(f"\n{'=' * 60}")
print(f"🚀 Dashboard running at: http://{host}:{port}")
print(" Access from your browser to see live graphs")
print(
" Generate disk I/O to see data: dd if=/dev/zero of=/tmp/test bs=1M count=100"
)
print(f"{'=' * 60}\n")
self.app.run(debug=debug, host=host, port=port)

View File

@ -0,0 +1,96 @@
"""Data collection and management."""
import threading
import time
import numpy as np
from collections import deque
from dataclasses import dataclass
from typing import List, Dict
@dataclass
class LatencyStats:
"""Statistics computed from latency data."""
total: int = 0
mean: float = 0.0
median: float = 0.0
min: float = 0.0
max: float = 0.0
p95: float = 0.0
p99: float = 0.0
@classmethod
def from_array(cls, data: np.ndarray) -> "LatencyStats":
"""Compute stats from numpy array."""
if len(data) == 0:
return cls()
return cls(
total=len(data),
mean=float(np.mean(data)),
median=float(np.median(data)),
min=float(np.min(data)),
max=float(np.max(data)),
p95=float(np.percentile(data, 95)),
p99=float(np.percentile(data, 99)),
)
class LatencyCollector:
"""Collects and manages latency data from BPF."""
def __init__(self, bpf_object, buffer_size: int = 10000):
self.bpf = bpf_object
self.all_latencies: List[float] = []
self.recent_latencies = deque(maxlen=buffer_size) # type: ignore [var-annotated]
self.start_time = time.time()
self._lock = threading.Lock()
self._poll_thread = None
def callback(self, cpu: int, event):
"""Callback for BPF events."""
with self._lock:
self.all_latencies.append(event.delta_us)
self.recent_latencies.append(
{"time": time.time() - self.start_time, "latency": event.delta_us}
)
def start(self):
"""Start collecting data."""
self.bpf["events"].open_perf_buffer(self.callback, struct_name="latency_event")
def poll_loop():
while True:
self.bpf["events"].poll(100)
self._poll_thread = threading.Thread(target=poll_loop, daemon=True)
self._poll_thread.start()
print("✅ Data collection started")
def get_all_latencies(self) -> np.ndarray:
"""Get all latencies as numpy array."""
with self._lock:
return np.array(self.all_latencies) if self.all_latencies else np.array([])
def get_recent_latencies(self) -> List[Dict]:
"""Get recent latencies with timestamps."""
with self._lock:
return list(self.recent_latencies)
def get_stats(self) -> LatencyStats:
"""Compute current statistics."""
return LatencyStats.from_array(self.get_all_latencies())
def get_histogram_buckets(self) -> Dict[int, int]:
"""Get log2 histogram buckets."""
latencies = self.get_all_latencies()
if len(latencies) == 0:
return {}
log_buckets = np.floor(np.log2(latencies + 1)).astype(int)
buckets = {} # type: ignore [var-annotated]
for bucket in log_buckets:
buckets[bucket] = buckets.get(bucket, 0) + 1
return buckets