Add enhanced vfsreadlat BCC example with live plotly and dash graphs on browser

2026-02-12 16:10:59 +00:00 · 2025-10-21 05:36:59 +05:30
parent e98d5684ea
commit 798f07986a
3 changed files with 479 additions and 0 deletions
--- a/BCC-Examples/vfsreadlat_plotly/bpf_program.py
+++ b/BCC-Examples/vfsreadlat_plotly/bpf_program.py
@ -0,0 +1,101 @@
+"""BPF program for tracing VFS read latency."""
+
+from pythonbpf import bpf, map, struct, section, bpfglobal, BPF
+from pythonbpf.helper import ktime, pid
+from pythonbpf.maps import HashMap, PerfEventArray
+from ctypes import c_void_p, c_uint64
+import argparse
+from data_collector import LatencyCollector
+from dashboard import LatencyDashboard
+
+
+# Record emitted to userspace for each traced vfs_read call. do_return fills
+# one in and passes it to events.output(); the userspace collector opens the
+# perf buffer with struct_name="latency_event".
+@bpf
+@struct
+class latency_event:
+    # Value returned by the pid() helper in the return probe.
+    pid: c_uint64
+    # Entry-to-return latency, stored as delta_ns // 1000 by do_return.
+    delta_us: c_uint64
+
+
+@bpf
+@map
+def start() -> HashMap:
+    """Map to store start timestamps by PID.
+
+    do_entry stores ktime() under the key pid(); do_return looks the entry up
+    and deletes it again. Declared with max_entries=10240.
+
+    NOTE(review): there is one slot per pid() value, so overlapping vfs_read
+    calls that share a pid() value would overwrite each other's timestamp --
+    confirm whether pid() yields a process id or a thread id.
+    """
+    return HashMap(key=c_uint64, value=c_uint64, max_entries=10240)
+
+
+@bpf
+@map
+def events() -> PerfEventArray:
+    """Perf event array for sending latency events to userspace.
+
+    do_return writes latency_event records here with events.output(); the
+    userspace collector reads them through bpf["events"].
+    """
+    return PerfEventArray(key_size=c_uint64, value_size=c_uint64)
+
+
+@bpf
+@section("kprobe/vfs_read")
+def do_entry(ctx: c_void_p) -> c_uint64:
+    """Record start time when vfs_read is called.
+
+    Stores the current ktime() in the start map under the caller's pid(),
+    replacing any value already stored for that key. ctx is unused.
+    Always returns 0.
+    """
+    p, ts = pid(), ktime()
+    start.update(p, ts)
+    return 0  # type: ignore [return-value]
+
+
+@bpf
+@section("kretprobe/vfs_read")
+def do_return(ctx: c_void_p) -> c_uint64:
+    """Calculate and record latency when vfs_read returns.
+
+    Looks up the timestamp do_entry stored for this pid(). If one exists, the
+    elapsed time is computed and, when it exceeds 1000 (named delta_ns), a
+    latency_event is emitted on the events perf buffer. The start entry is
+    deleted whenever it was found, whether or not an event was emitted.
+    ctx is unused. Always returns 0.
+    """
+    p = pid()
+    tsp = start.lookup(p)
+
+    # No stored timestamp means no matching entry probe was recorded for this
+    # key; nothing can be measured, so fall through to the return.
+    if tsp:
+        delta_ns = ktime() - tsp
+
+        # Only track latencies > 1 microsecond
+        if delta_ns > 1000:
+            evt = latency_event()
+            # Integer division converts the delta to the unit named by delta_us.
+            evt.pid, evt.delta_us = p, delta_ns // 1000
+            events.output(evt)
+
+        # Remove the consumed timestamp so the key can be reused.
+        start.delete(p)
+
+    return 0  # type: ignore [return-value]
+
+
+# License string declared for the BPF program through the @bpfglobal
+# decorator; the value is "GPL".
+@bpf
+@bpfglobal
+def LICENSE() -> str:
+    return "GPL"
+
+
+def parse_args():
+    """Build the command-line parser and return the parsed options.
+
+    Recognised options:
+        --host    dashboard bind address (default "0.0.0.0")
+        --port    dashboard port, int (default 8050)
+        --buffer  size of the recent-samples buffer, int (default 10000)
+
+    Returns:
+        The argparse.Namespace produced from sys.argv.
+    """
+    cli = argparse.ArgumentParser(
+        description="Monitor VFS read latency with live dashboard"
+    )
+
+    # (flag, keyword arguments for add_argument), registered in this order.
+    option_table = (
+        ("--host", {"default": "0.0.0.0", "help": "Dashboard host (default: 0.0.0.0)"}),
+        (
+            "--port",
+            {"type": int, "default": 8050, "help": "Dashboard port (default: 8050)"},
+        ),
+        ("--buffer", {"type": int, "default": 10000, "help": "Recent data buffer size"}),
+    )
+    for flag, options in option_table:
+        cli.add_argument(flag, **options)
+
+    return cli.parse_args()
+
+
+# Script entry: everything below runs at module level (there is no
+# __main__ guard), so importing this module also executes it.
+args = parse_args()
+
+# Load BPF program
+# BPF() builds the program object; load() and attach_all() are called on it
+# before any userspace reader is started.
+print("Loading BPF program...")
+b = BPF()
+b.load()
+b.attach_all()
+print("✅ BPF program loaded and attached")
+
+# Setup data collector
+# start() opens the "events" perf buffer and launches a daemon polling thread.
+collector = LatencyCollector(b, buffer_size=args.buffer)
+collector.start()
+
+# Create and run dashboard
+# run() starts the Dash server -- presumably blocking until the server exits.
+dashboard = LatencyDashboard(collector)
+dashboard.run(host=args.host, port=args.port)
--- a/BCC-Examples/vfsreadlat_plotly/dashboard.py
+++ b/BCC-Examples/vfsreadlat_plotly/dashboard.py
@ -0,0 +1,282 @@
+"""Plotly Dash dashboard for visualizing latency data."""
+
+import dash
+from dash import dcc, html
+from dash.dependencies import Input, Output
+import plotly.graph_objects as go
+from plotly.subplots import make_subplots
+import numpy as np
+
+
+class LatencyDashboard:
+    """Interactive Dash application for live latency visualization.
+
+    The page shows three stat cards (total samples, mean, P99) and three
+    graphs (linear/log2 histograms, log2 bucket bar chart, time series). A
+    ``dcc.Interval`` fires every 1000 ms and triggers a single callback that
+    refreshes all six outputs.
+
+    The collector passed in must provide ``get_stats()`` (an object with
+    ``total``, ``mean`` and ``p99``), ``get_all_latencies()``,
+    ``get_histogram_buckets()`` and ``get_recent_latencies()``.
+    """
+
+    def __init__(self, collector, title: str = "VFS Read Latency Monitor"):
+        """Create the Dash app and wire up its layout and callback.
+
+        Args:
+            collector: Data source queried on every refresh.
+            title: Value assigned to the Dash app's ``title``.
+        """
+        self.collector = collector
+        self.app = dash.Dash(__name__)
+        self.app.title = title
+        self._setup_layout()
+        self._setup_callbacks()
+
+    def _setup_layout(self):
+        """Create dashboard layout: heading, stat cards, graphs, and timer."""
+        self.app.layout = html.Div(
+            [
+                html.H1(
+                    "🔥 VFS Read Latency Dashboard",
+                    style={
+                        "textAlign": "center",
+                        "color": "#2c3e50",
+                        "marginBottom": 20,
+                    },
+                ),
+                # Stats cards
+                html.Div(
+                    [
+                        self._create_stat_card(
+                            "total-samples", "📊 Total Samples", "#3498db"
+                        ),
+                        self._create_stat_card(
+                            "mean-latency", "⚡ Mean Latency", "#e74c3c"
+                        ),
+                        self._create_stat_card(
+                            "p99-latency", "🔥 P99 Latency", "#f39c12"
+                        ),
+                    ],
+                    style={
+                        "display": "flex",
+                        "justifyContent": "space-around",
+                        "marginBottom": 30,
+                    },
+                ),
+                # Graphs: these component IDs must match the Output() IDs
+                # declared in _setup_callbacks.
+                dcc.Graph(id="dual-histogram", style={"height": "450px"}),
+                dcc.Graph(id="log2-buckets", style={"height": "350px"}),
+                dcc.Graph(id="timeseries-graph", style={"height": "300px"}),
+                # Auto-update: fires the refresh callback every 1000 ms.
+                dcc.Interval(id="interval-component", interval=1000, n_intervals=0),
+            ],
+            style={"padding": 20, "fontFamily": "Arial, sans-serif"},
+        )
+
+    def _create_stat_card(self, id_name: str, title: str, color: str):
+        """Create a statistics card.
+
+        Args:
+            id_name: Component ID of the value element (a callback target).
+            title: Caption shown above the value.
+            color: CSS color used for the caption.
+
+        Returns:
+            An ``html.Div`` holding the caption and an initially empty value.
+        """
+        return html.Div(
+            [
+                html.H3(title, style={"color": color}),
+                html.H2(id=id_name, style={"fontSize": 48, "color": "#2c3e50"}),
+            ],
+            className="stat-box",
+            style={
+                "background": "white",
+                "padding": 20,
+                "borderRadius": 10,
+                "boxShadow": "0 4px 6px rgba(0,0,0,0.1)",
+                "textAlign": "center",
+                "flex": 1,
+                "margin": "0 10px",
+            },
+        )
+
+    def _setup_callbacks(self):
+        """Register the single callback that refreshes every output."""
+
+        @self.app.callback(
+            [
+                Output("total-samples", "children"),
+                Output("mean-latency", "children"),
+                Output("p99-latency", "children"),
+                # Graph IDs below must match those used in _setup_layout.
+                Output("dual-histogram", "figure"),
+                Output("log2-buckets", "figure"),
+                Output("timeseries-graph", "figure"),
+            ],
+            [Input("interval-component", "n_intervals")],
+        )
+        def update_dashboard(n):
+            # n is the interval tick count; it only serves as the trigger.
+            stats = self.collector.get_stats()
+
+            if stats.total == 0:
+                return self._empty_state()
+
+            # Order must follow the Output() list above.
+            return (
+                f"{stats.total:,}",
+                f"{stats.mean:.1f} µs",
+                f"{stats.p99:.1f} µs",
+                self._create_dual_histogram(),
+                self._create_log2_buckets(),
+                self._create_timeseries(),
+            )
+
+    def _empty_state(self):
+        """Return placeholder values used while no samples have arrived.
+
+        Returns:
+            A 6-tuple (three stat strings followed by three figures) matching
+            the callback's Output() list. The same empty figure object is
+            used for all three graphs.
+        """
+        empty_fig = go.Figure()
+        empty_fig.update_layout(
+            title="Waiting for data... Generate some disk I/O!", template="plotly_white"
+        )
+        return "0", "0 µs", "0 µs", empty_fig, empty_fig, empty_fig
+
+    def _create_dual_histogram(self) -> go.Figure:
+        """Create side-by-side linear and log2 histograms of all latencies."""
+        latencies = self.collector.get_all_latencies()
+
+        # Create subplots
+        fig = make_subplots(
+            rows=1,
+            cols=2,
+            subplot_titles=("Linear Scale", "Log2 Scale"),
+            horizontal_spacing=0.12,
+        )
+
+        # Linear histogram
+        fig.add_trace(
+            go.Histogram(
+                x=latencies,
+                nbinsx=50,
+                marker_color="rgb(55, 83, 109)",
+                opacity=0.75,
+                name="Linear",
+            ),
+            row=1,
+            col=1,
+        )
+
+        # Log2 histogram
+        log2_latencies = np.log2(latencies + 1)  # +1 to avoid log2(0)
+        fig.add_trace(
+            go.Histogram(
+                x=log2_latencies,
+                nbinsx=30,
+                marker_color="rgb(243, 156, 18)",
+                opacity=0.75,
+                name="Log2",
+            ),
+            row=1,
+            col=2,
+        )
+
+        # Update axes
+        fig.update_xaxes(title_text="Latency (µs)", row=1, col=1)
+        fig.update_xaxes(title_text="log2(Latency in µs)", row=1, col=2)
+        fig.update_yaxes(title_text="Count", row=1, col=1)
+        fig.update_yaxes(title_text="Count", row=1, col=2)
+
+        fig.update_layout(
+            title_text="📊 Latency Distribution (Linear vs Log2)",
+            template="plotly_white",
+            showlegend=False,
+            height=450,
+        )
+
+        return fig
+
+    def _create_log2_buckets(self) -> go.Figure:
+        """Create bar chart of log2 buckets (like BCC histogram).
+
+        Each bucket key ``k`` from the collector is labelled
+        ``"2**k-2**(k+1)"``; the hover text adds the count and its share of
+        all samples. Returns a titled placeholder figure when the collector
+        reports no buckets.
+        """
+        buckets = self.collector.get_histogram_buckets()
+
+        if not buckets:
+            fig = go.Figure()
+            fig.update_layout(
+                title="🔥 Log2 Histogram - Waiting for data...", template="plotly_white"
+            )
+            return fig
+
+        # Sort buckets
+        sorted_buckets = sorted(buckets)
+        counts = [buckets[b] for b in sorted_buckets]
+
+        # The grand total is loop-invariant, so compute it once up front
+        # rather than once per bucket.
+        total = sum(counts)
+
+        # Create labels (e.g., "8-16", "16-32") and matching hover text
+        labels = []
+        hover_text = []
+        for bucket, count in zip(sorted_buckets, counts):
+            lower = 2**bucket
+            upper = 2 ** (bucket + 1)
+            labels.append(f"{lower}-{upper}")
+
+            pct = (count / total) * 100 if total > 0 else 0
+            hover_text.append(
+                f"Range: {lower}-{upper} µs<br>"
+                f"Count: {count:,}<br>"
+                f"Percentage: {pct:.2f}%"
+            )
+
+        # Create bar chart
+        fig = go.Figure()
+
+        fig.add_trace(
+            go.Bar(
+                x=labels,
+                y=counts,
+                marker=dict(
+                    color=counts,
+                    colorscale="YlOrRd",
+                    showscale=True,
+                    colorbar=dict(title="Count"),
+                ),
+                text=counts,
+                textposition="outside",
+                hovertext=hover_text,
+                hoverinfo="text",
+            )
+        )
+
+        fig.update_layout(
+            title="🔥 Log2 Histogram (BCC-style buckets)",
+            xaxis_title="Latency Range (µs)",
+            yaxis_title="Count",
+            template="plotly_white",
+            height=350,
+            xaxis=dict(tickangle=-45),
+        )
+
+        return fig
+
+    def _create_timeseries(self) -> go.Figure:
+        """Create time series figure from the collector's recent samples.
+
+        Each sample is a dict with ``"time"`` and ``"latency"`` keys. Returns
+        a titled placeholder figure when there are no recent samples.
+        """
+        recent = self.collector.get_recent_latencies()
+
+        if not recent:
+            fig = go.Figure()
+            fig.update_layout(
+                title="⏱️ Real-time Latency - Waiting for data...",
+                template="plotly_white",
+            )
+            return fig
+
+        times = [d["time"] for d in recent]
+        lats = [d["latency"] for d in recent]
+
+        fig = go.Figure()
+        fig.add_trace(
+            go.Scatter(
+                x=times,
+                y=lats,
+                mode="lines",
+                line=dict(color="rgb(231, 76, 60)", width=2),
+                fill="tozeroy",
+                fillcolor="rgba(231, 76, 60, 0.2)",
+            )
+        )
+
+        fig.update_layout(
+            # Report the number of samples actually plotted: the buffer size
+            # is configurable and the buffer may not be full yet, so a fixed
+            # "10,000" would be wrong in both cases.
+            title=f"⏱️ Real-time Latency (Last {len(recent):,} samples)",
+            xaxis_title="Time (seconds)",
+            yaxis_title="Latency (µs)",
+            template="plotly_white",
+            height=300,
+        )
+
+        return fig
+
+    def run(self, host: str = "0.0.0.0", port: int = 8050, debug: bool = False):
+        """Print a startup banner and run the dashboard server.
+
+        Args:
+            host: Address passed to ``app.run``.
+            port: Port passed to ``app.run``.
+            debug: Debug flag passed to ``app.run``.
+        """
+        print(f"\n{'=' * 60}")
+        print(f"🚀 Dashboard running at: http://{host}:{port}")
+        print("   Access from your browser to see live graphs")
+        print(
+            "   Generate disk I/O to see data: dd if=/dev/zero of=/tmp/test bs=1M count=100"
+        )
+        print(f"{'=' * 60}\n")
+        self.app.run(debug=debug, host=host, port=port)
--- a/BCC-Examples/vfsreadlat_plotly/data_collector.py
+++ b/BCC-Examples/vfsreadlat_plotly/data_collector.py
@ -0,0 +1,96 @@
+"""Data collection and management."""
+
+import threading
+import time
+import numpy as np
+from collections import deque
+from dataclasses import dataclass
+from typing import List, Dict
+
+
+@dataclass
+class LatencyStats:
+    """Summary statistics for a batch of latency samples.
+
+    All fields default to zero, so ``LatencyStats()`` stands for "no samples".
+    Values are in whatever unit the input samples use.
+    """
+
+    total: int = 0  # number of samples
+    mean: float = 0.0
+    median: float = 0.0
+    min: float = 0.0
+    max: float = 0.0
+    p95: float = 0.0  # 95th percentile
+    p99: float = 0.0  # 99th percentile
+
+    @classmethod
+    def from_array(cls, data: np.ndarray) -> "LatencyStats":
+        """Summarise *data*; an empty array yields the all-zero instance.
+
+        Args:
+            data: Latency samples.
+
+        Returns:
+            A LatencyStats whose numeric fields are plain Python floats.
+        """
+        sample_count = len(data)
+        if sample_count == 0:
+            return cls()
+
+        p95_value, p99_value = (float(np.percentile(data, q)) for q in (95, 99))
+        summary = {
+            "total": sample_count,
+            "mean": float(np.mean(data)),
+            "median": float(np.median(data)),
+            "min": float(np.min(data)),
+            "max": float(np.max(data)),
+            "p95": p95_value,
+            "p99": p99_value,
+        }
+        return cls(**summary)
+
+
+class LatencyCollector:
+    """Collects latency samples from a BPF perf buffer and serves snapshots.
+
+    Samples arrive through ``callback`` (invoked while the background thread
+    polls the ``"events"`` perf buffer) and are read by the ``get_*`` methods.
+    Both sides take the same lock, so readers always get a consistent copy.
+    """
+
+    def __init__(self, bpf_object, buffer_size: int = 10000):
+        """Initialise empty storage; no data is collected until ``start()``.
+
+        Args:
+            bpf_object: Object indexable as ``bpf_object["events"]`` that
+                yields a perf buffer with ``open_perf_buffer`` and ``poll``.
+            buffer_size: Maximum number of timestamped samples kept in
+                ``recent_latencies``; older ones are dropped.
+        """
+        self.bpf = bpf_object
+        # NOTE: this list is never trimmed, so memory use grows with every
+        # event for as long as the process runs.
+        self.all_latencies: List[float] = []
+        self.recent_latencies = deque(maxlen=buffer_size)  # type: ignore [var-annotated]
+        self.start_time = time.time()
+        self._lock = threading.Lock()
+        self._poll_thread = None
+
+    def callback(self, cpu: int, event):
+        """Store one BPF event.
+
+        Args:
+            cpu: Unused; part of the perf-buffer callback signature.
+            event: Object exposing ``delta_us``, the latency to record.
+        """
+        with self._lock:
+            self.all_latencies.append(event.delta_us)
+            self.recent_latencies.append(
+                {"time": time.time() - self.start_time, "latency": event.delta_us}
+            )
+
+    def start(self):
+        """Open the perf buffer and start the background polling thread."""
+        self.bpf["events"].open_perf_buffer(self.callback, struct_name="latency_event")
+
+        def poll_loop():
+            while True:
+                self.bpf["events"].poll(100)
+
+        # Daemon thread: it must not keep the process alive at shutdown.
+        self._poll_thread = threading.Thread(target=poll_loop, daemon=True)
+        self._poll_thread.start()
+        print("✅ Data collection started")
+
+    def get_all_latencies(self) -> np.ndarray:
+        """Return a copy of every recorded latency as a numpy array.
+
+        An empty history yields an empty float array.
+        """
+        with self._lock:
+            # np.array([]) is already the empty float array, so no special
+            # case is needed for an empty history.
+            return np.array(self.all_latencies)
+
+    def get_recent_latencies(self) -> List[Dict]:
+        """Return a copy of the recent samples, oldest first.
+
+        Each item is ``{"time": seconds since construction, "latency": value}``.
+        """
+        with self._lock:
+            return list(self.recent_latencies)
+
+    def get_stats(self) -> LatencyStats:
+        """Compute statistics over every recorded latency."""
+        return LatencyStats.from_array(self.get_all_latencies())
+
+    def get_histogram_buckets(self) -> Dict[int, int]:
+        """Count samples per power-of-two bucket.
+
+        Bucket ``k`` holds the samples with ``2**k <= latency < 2**(k + 1)``,
+        which is the range the dashboard prints for that key. Values below 1
+        are clamped into bucket 0 so that log2 is never taken of zero.
+
+        Returns:
+            ``{bucket_index: sample_count}`` with plain ``int`` keys and
+            values; empty when nothing has been recorded.
+        """
+        latencies = self.get_all_latencies()
+        if len(latencies) == 0:
+            return {}
+
+        # floor(log2(x)) rather than floor(log2(x + 1)): the latter shifts
+        # every value of the form 2**k - 1 (3, 7, 15, ...) up one bucket.
+        bucket_index = np.floor(np.log2(np.maximum(latencies, 1))).astype(int)
+
+        # Vectorised counting; tolist() converts numpy scalars to Python ints.
+        keys, counts = np.unique(bucket_index, return_counts=True)
+        return dict(zip(keys.tolist(), counts.tolist()))