From bbe499087892846e3cf80f7ad876955ec54086e2 Mon Sep 17 00:00:00 2001 From: varun-r-mallya Date: Fri, 28 Nov 2025 20:52:48 +0530 Subject: [PATCH] add container-monitor example --- BCC-Examples/container-monitor/README.md | 49 +++ .../container-monitor/container_monitor.py | 221 +++++++++++++ .../container-monitor/data_collector.py | 208 ++++++++++++ BCC-Examples/container-monitor/file_io.bpf.py | 191 ----------- .../container-monitor/net_stats.bpf.py | 192 ------------ BCC-Examples/container-monitor/syscall.bpf.py | 132 -------- BCC-Examples/container-monitor/tui.py | 296 ++++++++++++++++++ 7 files changed, 774 insertions(+), 515 deletions(-) create mode 100644 BCC-Examples/container-monitor/README.md create mode 100644 BCC-Examples/container-monitor/container_monitor.py create mode 100644 BCC-Examples/container-monitor/data_collector.py delete mode 100644 BCC-Examples/container-monitor/file_io.bpf.py delete mode 100644 BCC-Examples/container-monitor/net_stats.bpf.py delete mode 100644 BCC-Examples/container-monitor/syscall.bpf.py create mode 100644 BCC-Examples/container-monitor/tui.py diff --git a/BCC-Examples/container-monitor/README.md b/BCC-Examples/container-monitor/README.md new file mode 100644 index 0000000..dc27049 --- /dev/null +++ b/BCC-Examples/container-monitor/README.md @@ -0,0 +1,49 @@ +# Container Monitor TUI + +A beautiful terminal-based container monitoring tool that combines syscall tracking, file I/O monitoring, and network traffic analysis using eBPF. + +## Features + +- 🎯 **Interactive Cgroup Selection** - Navigate and select cgroups with arrow keys +- 📊 **Real-time Monitoring** - Live graphs and statistics +- 🔥 **Syscall Tracking** - Total syscall count per cgroup +- 💾 **File I/O Monitoring** - Read/write operations and bytes with graphs +- 🌐 **Network Traffic** - RX/TX packets and bytes with live graphs +- ⚡ **Efficient Caching** - Reduced /proc lookups for better performance +- 🎨 **Beautiful TUI** - Clean, colorful terminal interface + +## Requirements + +- Python 3.7+ +- pythonbpf +- Root privileges (for eBPF) + +## Installation + +```bash +# Ensure you have pythonbpf installed +pip install pythonbpf + +# Run the monitor +sudo $(which python) container_monitor.py +``` + +## Usage + +1. **Selection Screen**: Use ↑↓ arrow keys to navigate through cgroups, press ENTER to select +2. **Monitoring Screen**: View real-time graphs and statistics, press ESC or 'b' to go back +3. **Exit**: Press 'q' at any time to quit + +## Architecture + +- `container_monitor.py` - Main BPF program combining all three tracers +- `data_collector.py` - Data collection, caching, and history management +- `tui. py` - Terminal user interface with selection and monitoring screens + +## BPF Programs + +- **vfs_read/vfs_write** - Track file I/O operations +- **__netif_receive_skb/__dev_queue_xmit** - Track network traffic +- **raw_syscalls/sys_enter** - Count all syscalls + +All programs filter by cgroup ID for per-container monitoring. diff --git a/BCC-Examples/container-monitor/container_monitor.py b/BCC-Examples/container-monitor/container_monitor.py new file mode 100644 index 0000000..07a29c1 --- /dev/null +++ b/BCC-Examples/container-monitor/container_monitor.py @@ -0,0 +1,221 @@ +"""Container Monitor - TUI-based cgroup monitoring combining syscall, file I/O, and network tracking.""" + +import time +import os +from pathlib import Path +from pythonbpf import bpf, map, section, bpfglobal, struct, BPF +from pythonbpf.maps import HashMap +from pythonbpf.helper import get_current_cgroup_id +from ctypes import c_int32, c_uint64, c_void_p +from vmlinux import struct_pt_regs, struct_sk_buff + +from data_collector import ContainerDataCollector +from tui import ContainerMonitorTUI + + +# ==================== BPF Structs ==================== + +@bpf +@struct +class read_stats: + bytes: c_uint64 + ops: c_uint64 + + +@bpf +@struct +class write_stats: + bytes: c_uint64 + ops: c_uint64 + + +@bpf +@struct +class net_stats: + rx_packets: c_uint64 + tx_packets: c_uint64 + rx_bytes: c_uint64 + tx_bytes: c_uint64 + + +# ==================== BPF Maps ==================== + +@bpf +@map +def read_map() -> HashMap: + return HashMap(key=c_uint64, value=read_stats, max_entries=1024) + + +@bpf +@map +def write_map() -> HashMap: + return HashMap(key=c_uint64, value=write_stats, max_entries=1024) + + +@bpf +@map +def net_stats_map() -> HashMap: + return HashMap(key=c_uint64, value=net_stats, max_entries=1024) + + +@bpf +@map +def syscall_count() -> HashMap: + return HashMap(key=c_uint64, value=c_uint64, max_entries=1024) + + +# ==================== File I/O Tracing ==================== + +@bpf +@section("kprobe/vfs_read") +def trace_read(ctx: struct_pt_regs) -> c_int32: + cg = get_current_cgroup_id() + count = c_uint64(ctx.dx) + ptr = read_map.lookup(cg) + if ptr: + s = read_stats() + s.bytes = ptr.bytes + count + s.ops = ptr.ops + 1 + read_map.update(cg, s) + else: + s = read_stats() + s.bytes = count + s.ops = c_uint64(1) + read_map.update(cg, s) + + return c_int32(0) + + +@bpf +@section("kprobe/vfs_write") +def trace_write(ctx1: struct_pt_regs) -> c_int32: + cg = get_current_cgroup_id() + count = c_uint64(ctx1.dx) + ptr = write_map.lookup(cg) + + if ptr: + s = write_stats() + s.bytes = ptr.bytes + count + s.ops = ptr.ops + 1 + write_map.update(cg, s) + else: + s = write_stats() + s.bytes = count + s.ops = c_uint64(1) + write_map.update(cg, s) + + return c_int32(0) + + +# ==================== Network I/O Tracing ==================== + +@bpf +@section("kprobe/__netif_receive_skb") +def trace_netif_rx(ctx2: struct_pt_regs) -> c_int32: + cgroup_id = get_current_cgroup_id() + skb = struct_sk_buff(ctx2.di) + pkt_len = c_uint64(skb.len) + + stats_ptr = net_stats_map.lookup(cgroup_id) + + if stats_ptr: + stats = net_stats() + stats.rx_packets = stats_ptr.rx_packets + 1 + stats.tx_packets = stats_ptr.tx_packets + stats.rx_bytes = stats_ptr.rx_bytes + pkt_len + stats.tx_bytes = stats_ptr.tx_bytes + net_stats_map.update(cgroup_id, stats) + else: + stats = net_stats() + stats.rx_packets = c_uint64(1) + stats.tx_packets = c_uint64(0) + stats.rx_bytes = pkt_len + stats.tx_bytes = c_uint64(0) + net_stats_map.update(cgroup_id, stats) + + return c_int32(0) + + +@bpf +@section("kprobe/__dev_queue_xmit") +def trace_dev_xmit(ctx3: struct_pt_regs) -> c_int32: + cgroup_id = get_current_cgroup_id() + skb = struct_sk_buff(ctx3.di) + pkt_len = c_uint64(skb.len) + + stats_ptr = net_stats_map.lookup(cgroup_id) + + if stats_ptr: + stats = net_stats() + stats.rx_packets = stats_ptr.rx_packets + stats.tx_packets = stats_ptr.tx_packets + 1 + stats.rx_bytes = stats_ptr.rx_bytes + stats.tx_bytes = stats_ptr.tx_bytes + pkt_len + net_stats_map.update(cgroup_id, stats) + else: + stats = net_stats() + stats.rx_packets = c_uint64(0) + stats.tx_packets = c_uint64(1) + stats.rx_bytes = c_uint64(0) + stats.tx_bytes = pkt_len + net_stats_map.update(cgroup_id, stats) + + return c_int32(0) + + +# ==================== Syscall Tracing ==================== + +@bpf +@section("tracepoint/raw_syscalls/sys_enter") +def count_syscalls(ctx: c_void_p) -> c_int32: + cgroup_id = get_current_cgroup_id() + count_ptr = syscall_count.lookup(cgroup_id) + + if count_ptr: + new_count = count_ptr + c_uint64(1) + syscall_count.update(cgroup_id, new_count) + else: + syscall_count.update(cgroup_id, c_uint64(1)) + + return c_int32(0) + + +@bpf +@bpfglobal +def LICENSE() -> str: + return "GPL" + + +# ==================== Main ==================== + +if __name__ == "__main__": + print("🔥 Loading BPF programs...") + + # Load and attach BPF program + b = BPF() + b.load() + b.attach_all() + + # Get map references and enable struct deserialization + read_map_ref = b["read_map"] + write_map_ref = b["write_map"] + net_stats_map_ref = b["net_stats_map"] + syscall_count_ref = b["syscall_count"] + + read_map_ref.set_value_struct("read_stats") + write_map_ref.set_value_struct("write_stats") + net_stats_map_ref.set_value_struct("net_stats") + + print("✅ BPF programs loaded and attached") + + # Setup data collector + collector = ContainerDataCollector( + read_map_ref, + write_map_ref, + net_stats_map_ref, + syscall_count_ref + ) + + # Create and run TUI + tui = ContainerMonitorTUI(collector) + tui.run() diff --git a/BCC-Examples/container-monitor/data_collector.py b/BCC-Examples/container-monitor/data_collector.py new file mode 100644 index 0000000..47c2b49 --- /dev/null +++ b/BCC-Examples/container-monitor/data_collector.py @@ -0,0 +1,208 @@ +"""Data collection and management for container monitoring.""" + +import os +import time +from pathlib import Path +from typing import Dict, List, Set, Optional, Tuple +from dataclasses import dataclass +from collections import deque, defaultdict + + +@dataclass +class CgroupInfo: + """Information about a cgroup.""" + id: int + name: str + path: str + + +@dataclass +class ContainerStats: + """Statistics for a container/cgroup.""" + cgroup_id: int + cgroup_name: str + + # File I/O + read_ops: int = 0 + read_bytes: int = 0 + write_ops: int = 0 + write_bytes: int = 0 + + # Network I/O + rx_packets: int = 0 + rx_bytes: int = 0 + tx_packets: int = 0 + tx_bytes: int = 0 + + # Syscalls + syscall_count: int = 0 + + # Timestamp + timestamp: float = 0.0 + + +class ContainerDataCollector: + """Collects and manages container monitoring data from BPF.""" + + def __init__(self, read_map, write_map, net_stats_map, syscall_map, history_size: int = 100): + self.read_map = read_map + self.write_map = write_map + self.net_stats_map = net_stats_map + self.syscall_map = syscall_map + + # Caching + self._cgroup_cache: Dict[int, CgroupInfo] = {} + self._cgroup_cache_time = 0 + self._cache_ttl = 5. + 0 # Refresh cache every 5 seconds + + # Historical data for graphing + self._history_size = history_size + self._history: Dict[int, deque] = defaultdict(lambda: deque(maxlen=history_size)) + + def get_all_cgroups(self) -> List[CgroupInfo]: + """Get all cgroups with caching.""" + current_time = time.time() + + # Use cached data if still valid + if current_time - self._cgroup_cache_time < self._cache_ttl: + return list(self._cgroup_cache.values()) + + # Refresh cache + self._refresh_cgroup_cache() + return list(self._cgroup_cache.values()) + + def _refresh_cgroup_cache(self): + """Refresh the cgroup cache from /proc.""" + cgroup_map: Dict[int, Set[str]] = defaultdict(set) + + # Scan /proc to find all cgroups + for proc_dir in Path("/proc").glob("[0-9]*"): + try: + cgroup_file = proc_dir / "cgroup" + if not cgroup_file.exists(): + continue + + with open(cgroup_file) as f: + for line in f: + parts = line.strip().split(":") + if len(parts) >= 3: + cgroup_path = parts[2] + cgroup_mount = f"/sys/fs/cgroup{cgroup_path}" + + if os.path.exists(cgroup_mount): + stat_info = os.stat(cgroup_mount) + cgroup_id = stat_info.st_ino + cgroup_map[cgroup_id].add(cgroup_path) + + except (PermissionError, FileNotFoundError, OSError): + continue + + # Update cache with best names + new_cache = {} + for cgroup_id, paths in cgroup_map.items(): + # Pick the most descriptive path + best_path = self._get_best_cgroup_path(paths) + name = self._get_cgroup_name(best_path) + + new_cache[cgroup_id] = CgroupInfo( + id=cgroup_id, + name=name, + path=best_path + ) + + self._cgroup_cache = new_cache + self._cgroup_cache_time = time.time() + + def _get_best_cgroup_path(self, paths: Set[str]) -> str: + """Select the most descriptive cgroup path.""" + path_list = list(paths) + + # Prefer paths with more components (more specific) + # Prefer paths containing docker, podman, etc. + for keyword in ['docker', 'podman', 'kubernetes', 'k8s', 'systemd']: + for path in path_list: + if keyword in path.lower(): + return path + + # Return longest path (most specific) + return max(path_list, key=lambda p: (len(p.split('/')), len(p))) + + def _get_cgroup_name(self, path: str) -> str: + """Extract a friendly name from cgroup path.""" + if not path or path == "/": + return "root" + + # Remove leading/trailing slashes + path = path.strip("/") + + # Try to extract container ID or service name + parts = path.split("/") + + # For Docker: /docker/ + if "docker" in path.lower(): + for i, part in enumerate(parts): + if part.lower() == "docker" and i + 1 < len(parts): + container_id = parts[i + 1][:12] # Short ID + return f"docker:{container_id}" + + # For systemd services + if "system. slice" in path: + for part in parts: + if part.endswith(". service"): + return part.replace(".service", "") + + # For user slices + if "user.slice" in path: + return f"user:{parts[-1]}" if parts else "user" + + # Default: use last component + return parts[-1] if parts else path + + def get_stats_for_cgroup(self, cgroup_id: int) -> ContainerStats: + """Get current statistics for a specific cgroup.""" + cgroup_info = self._cgroup_cache.get(cgroup_id) + cgroup_name = cgroup_info.name if cgroup_info else f"cgroup-{cgroup_id}" + + stats = ContainerStats( + cgroup_id=cgroup_id, + cgroup_name=cgroup_name, + timestamp=time.time() + ) + + # Get file I/O stats + read_stat = self.read_map.lookup(cgroup_id) + if read_stat: + stats.read_ops = int(read_stat.ops) + stats.read_bytes = int(read_stat.bytes) + + write_stat = self.write_map.lookup(cgroup_id) + if write_stat: + stats.write_ops = int(write_stat.ops) + stats.write_bytes = int(write_stat.bytes) + + # Get network stats + net_stat = self.net_stats_map.lookup(cgroup_id) + if net_stat: + stats.rx_packets = int(net_stat.rx_packets) + stats.rx_bytes = int(net_stat.rx_bytes) + stats.tx_packets = int(net_stat.tx_packets) + stats.tx_bytes = int(net_stat.tx_bytes) + + # Get syscall count + syscall_cnt = self.syscall_map.lookup(cgroup_id) + if syscall_cnt is not None: + stats.syscall_count = int(syscall_cnt) + + # Add to history + self._history[cgroup_id].append(stats) + + return stats + + def get_history(self, cgroup_id: int) -> List[ContainerStats]: + """Get historical statistics for graphing.""" + return list(self._history[cgroup_id]) + + def get_cgroup_info(self, cgroup_id: int) -> Optional[CgroupInfo]: + """Get cached cgroup information.""" + return self._cgroup_cache.get(cgroup_id) diff --git a/BCC-Examples/container-monitor/file_io.bpf.py b/BCC-Examples/container-monitor/file_io.bpf.py deleted file mode 100644 index b908a95..0000000 --- a/BCC-Examples/container-monitor/file_io.bpf.py +++ /dev/null @@ -1,191 +0,0 @@ -import logging -import time -import os -from pathlib import Path -from pythonbpf import bpf, map, section, bpfglobal, struct, compile, BPF -from pythonbpf.maps import HashMap -from pythonbpf.helper import get_current_cgroup_id -from ctypes import c_int32, c_uint64 -from vmlinux import struct_pt_regs - - -@bpf -@struct -class read_stats: - bytes: c_uint64 - ops: c_uint64 - - -@bpf -@struct -class write_stats: - bytes: c_uint64 - ops: c_uint64 - - -@bpf -@map -def read_map() -> HashMap: - return HashMap(key=c_uint64, value=read_stats, max_entries=1024) - - -@bpf -@map -def write_map() -> HashMap: - return HashMap(key=c_uint64, value=write_stats, max_entries=1024) - - -@bpf -@section("kprobe/vfs_read") -def trace_read(ctx: struct_pt_regs) -> c_int32: - cg = get_current_cgroup_id() - count = c_uint64(ctx.dx) - ptr = read_map.lookup(cg) - if ptr: - s = read_stats() - s.bytes = ptr.bytes + count - s.ops = ptr.ops + 1 - read_map.update(cg, s) - else: - s = read_stats() - s.bytes = count - s.ops = c_uint64(1) - read_map.update(cg, s) - - return c_int32(0) - - -@bpf -@section("kprobe/vfs_write") -def trace_write(ctx1: struct_pt_regs) -> c_int32: - cg = get_current_cgroup_id() - count = c_uint64(ctx1.dx) - ptr = write_map.lookup(cg) - - if ptr: - s = write_stats() - s.bytes = ptr.bytes + count - s.ops = ptr.ops + 1 - write_map.update(cg, s) - else: - s = write_stats() - s.bytes = count - s.ops = c_uint64(1) - write_map.update(cg, s) - - return c_int32(0) - - -@bpf -@bpfglobal -def LICENSE() -> str: - return "GPL" - - -# Load and attach BPF program -b = BPF() -b.load() -b.attach_all() - -# Get map references and enable struct deserialization -read_map_ref = b["read_map"] -write_map_ref = b["write_map"] -read_map_ref.set_value_struct("read_stats") -write_map_ref.set_value_struct("write_stats") - - -def get_cgroup_ids(): - """Get all cgroup IDs from the system""" - cgroup_ids = set() - - # Get cgroup IDs from running processes - for proc_dir in Path("/proc").glob("[0-9]*"): - try: - cgroup_file = proc_dir / "cgroup" - if cgroup_file.exists(): - with open(cgroup_file) as f: - for line in f: - # Parse cgroup path and get inode - parts = line.strip().split(":") - if len(parts) >= 3: - cgroup_path = parts[2] - # Try to get the cgroup inode which is used as ID - cgroup_mount = f"/sys/fs/cgroup{cgroup_path}" - if os.path.exists(cgroup_mount): - stat_info = os.stat(cgroup_mount) - cgroup_ids.add(stat_info.st_ino) - except (PermissionError, FileNotFoundError, OSError): - continue - - return cgroup_ids - - -# Display function -def display_stats(): - """Read and display file I/O statistics from BPF maps""" - print("\n" + "=" * 80) - print(f"{'CGROUP ID':<20} {'READ OPS':<15} {'READ BYTES':<20} {'WRITE OPS':<15} {'WRITE BYTES':<20}") - print("=" * 80) - - # Get cgroup IDs from the system - cgroup_ids = get_cgroup_ids() - - if not cgroup_ids: - print("No cgroups found...") - print("=" * 80) - return - - # Initialize totals - total_read_ops = 0 - total_read_bytes = 0 - total_write_ops = 0 - total_write_bytes = 0 - - # Track which cgroups have data - cgroups_with_data = [] - - # Display stats for each cgroup - for cgroup_id in sorted(cgroup_ids): - # Get read stats using lookup - read_ops = 0 - read_bytes = 0 - read_stat = read_map_ref.lookup(cgroup_id) - if read_stat: - read_ops = int(read_stat.ops) - read_bytes = int(read_stat.bytes) - total_read_ops += read_ops - total_read_bytes += read_bytes - - # Get write stats using lookup - write_ops = 0 - write_bytes = 0 - write_stat = write_map_ref.lookup(cgroup_id) - if write_stat: - write_ops = int(write_stat.ops) - write_bytes = int(write_stat.bytes) - total_write_ops += write_ops - total_write_bytes += write_bytes - - # Only print if there's data for this cgroup - if read_stat or write_stat: - print(f"{cgroup_id:<20} {read_ops:<15} {read_bytes:<20} {write_ops:<15} {write_bytes:<20}") - cgroups_with_data.append(cgroup_id) - - if not cgroups_with_data: - print("No data collected yet...") - - print("=" * 80) - print(f"{'TOTAL':<20} {total_read_ops:<15} {total_read_bytes:<20} {total_write_ops:<15} {total_write_bytes:<20}") - print() - - -# Main loop -if __name__ == "__main__": - print("Tracing file I/O operations... Press Ctrl+C to exit\n") - - try: - while True: - time.sleep(5) # Update every 5 seconds - display_stats() - except KeyboardInterrupt: - print("\nStopped") diff --git a/BCC-Examples/container-monitor/net_stats.bpf.py b/BCC-Examples/container-monitor/net_stats.bpf.py deleted file mode 100644 index b5e3383..0000000 --- a/BCC-Examples/container-monitor/net_stats.bpf.py +++ /dev/null @@ -1,192 +0,0 @@ -import logging -import time -import os -from pathlib import Path -from pythonbpf import bpf, map, section, bpfglobal, struct, compile, BPF -from pythonbpf.maps import HashMap -from pythonbpf.helper import get_current_cgroup_id -from ctypes import c_int32, c_uint64, c_void_p, c_uint32 -from vmlinux import struct_sk_buff, struct_pt_regs - - -@bpf -@struct -class net_stats: - rx_packets: c_uint64 - tx_packets: c_uint64 - rx_bytes: c_uint64 - tx_bytes: c_uint64 - - -@bpf -@map -def net_stats_map() -> HashMap: - return HashMap(key=c_uint64, value=net_stats, max_entries=1024) - - -@bpf -@section("kprobe/__netif_receive_skb") -def trace_netif_rx(ctx: struct_pt_regs) -> c_int32: - cgroup_id = get_current_cgroup_id() - - # Read skb pointer from first argument (PT_REGS_PARM1) - skb = struct_sk_buff(ctx.di) # x86_64: first arg in rdi - - # Read skb->len - pkt_len = c_uint64(skb.len) - - - stats_ptr = net_stats_map.lookup(cgroup_id) - - if stats_ptr: - stats = net_stats() - stats.rx_packets = stats_ptr.rx_packets + 1 - stats.tx_packets = stats_ptr.tx_packets - stats.rx_bytes = stats_ptr.rx_bytes + pkt_len - stats.tx_bytes = stats_ptr.tx_bytes - net_stats_map.update(cgroup_id, stats) - else: - stats = net_stats() - stats.rx_packets = c_uint64(1) - stats.tx_packets = c_uint64(0) - stats.rx_bytes = pkt_len - stats.tx_bytes = c_uint64(0) - net_stats_map.update(cgroup_id, stats) - - return c_int32(0) - -@bpf -@section("kprobe/__dev_queue_xmit") -def trace_dev_xmit(ctx1: struct_pt_regs) -> c_int32: - cgroup_id = get_current_cgroup_id() - - # Read skb pointer from first argument - skb = struct_sk_buff(ctx1.di) - pkt_len = c_uint64(skb.len) - - stats_ptr = net_stats_map.lookup(cgroup_id) - - if stats_ptr: - stats = net_stats() - stats.rx_packets = stats_ptr.rx_packets - stats.tx_packets = stats_ptr.tx_packets + 1 - stats.rx_bytes = stats_ptr.rx_bytes - stats.tx_bytes = stats_ptr.tx_bytes + pkt_len - net_stats_map.update(cgroup_id, stats) - else: - stats = net_stats() - stats.rx_packets = c_uint64(0) - stats.tx_packets = c_uint64(1) - stats.rx_bytes = c_uint64(0) - stats.tx_bytes = pkt_len - net_stats_map.update(cgroup_id, stats) - - return c_int32(0) - - -@bpf -@bpfglobal -def LICENSE() -> str: - return "GPL" - -# Load and attach BPF program -b = BPF() -b.load() -b.attach_all() - -# Get map reference and enable struct deserialization -net_stats_map_ref = b["net_stats_map"] -net_stats_map_ref.set_value_struct("net_stats") - - -def get_cgroup_ids(): - """Get all cgroup IDs from the system""" - cgroup_ids = set() - - # Get cgroup IDs from running processes - for proc_dir in Path("/proc").glob("[0-9]*"): - try: - cgroup_file = proc_dir / "cgroup" - if cgroup_file.exists(): - with open(cgroup_file) as f: - for line in f: - # Parse cgroup path and get inode - parts = line.strip().split(":") - if len(parts) >= 3: - cgroup_path = parts[2] - # Try to get the cgroup inode which is used as ID - cgroup_mount = f"/sys/fs/cgroup{cgroup_path}" - if os.path.exists(cgroup_mount): - stat_info = os.stat(cgroup_mount) - cgroup_ids.add(stat_info.st_ino) - except (PermissionError, FileNotFoundError, OSError): - continue - - return cgroup_ids - - -# Display function -def display_stats(): - """Read and display network I/O statistics from BPF maps""" - print("\n" + "=" * 100) - print(f"{'CGROUP ID':<20} {'RX PACKETS':<15} {'RX BYTES':<20} {'TX PACKETS':<15} {'TX BYTES':<20}") - print("=" * 100) - - # Get cgroup IDs from the system - cgroup_ids = get_cgroup_ids() - - if not cgroup_ids: - print("No cgroups found...") - print("=" * 100) - return - - # Initialize totals - total_rx_packets = 0 - total_rx_bytes = 0 - total_tx_packets = 0 - total_tx_bytes = 0 - - # Track which cgroups have data - cgroups_with_data = [] - - # Display stats for each cgroup - for cgroup_id in sorted(cgroup_ids): - # Get network stats using lookup - rx_packets = 0 - rx_bytes = 0 - tx_packets = 0 - tx_bytes = 0 - - net_stat = net_stats_map_ref.lookup(cgroup_id) - if net_stat: - rx_packets = int(net_stat.rx_packets) - rx_bytes = int(net_stat.rx_bytes) - tx_packets = int(net_stat.tx_packets) - tx_bytes = int(net_stat.tx_bytes) - - total_rx_packets += rx_packets - total_rx_bytes += rx_bytes - total_tx_packets += tx_packets - total_tx_bytes += tx_bytes - - print(f"{cgroup_id:<20} {rx_packets:<15} {rx_bytes:<20} {tx_packets:<15} {tx_bytes:<20}") - cgroups_with_data.append(cgroup_id) - - if not cgroups_with_data: - print("No data collected yet...") - - print("=" * 100) - print(f"{'TOTAL':<20} {total_rx_packets:<15} {total_rx_bytes:<20} {total_tx_packets:<15} {total_tx_bytes:<20}") - print() - - -# Main loop -if __name__ == "__main__": - print("Tracing network I/O operations... Press Ctrl+C to exit\n") - - try: - while True: - time.sleep(5) # Update every 5 seconds - display_stats() - except KeyboardInterrupt: - print("\nStopped") diff --git a/BCC-Examples/container-monitor/syscall.bpf.py b/BCC-Examples/container-monitor/syscall.bpf.py deleted file mode 100644 index 716a79e..0000000 --- a/BCC-Examples/container-monitor/syscall.bpf.py +++ /dev/null @@ -1,132 +0,0 @@ -import time -import os -from pathlib import Path -from pythonbpf import bpf, map, section, bpfglobal, BPF -from pythonbpf.maps import HashMap -from pythonbpf.helper import get_current_cgroup_id -from ctypes import c_void_p, c_int32, c_uint64 - - -@bpf -@map -def syscall_count() -> HashMap: - """Map tracking syscall count by cgroup ID""" - return HashMap(key=c_uint64, value=c_uint64, max_entries=1024) - - -@bpf -@section("tracepoint/raw_syscalls/sys_enter") -def count_syscalls(ctx: c_void_p) -> c_int32: - """ - Increment syscall counter for the current cgroup. - Attached to raw_syscalls/sys_enter tracepoint to catch all syscalls. - """ - cgroup_id = get_current_cgroup_id() - - # Lookup current count - count_ptr = syscall_count.lookup(cgroup_id) - - if count_ptr: - # Increment existing count - new_count = count_ptr + c_uint64(1) - syscall_count.update(cgroup_id, new_count) - else: - # First syscall for this cgroup - syscall_count.update(cgroup_id, c_uint64(1)) - - return c_int32(0) - - -@bpf -@bpfglobal -def LICENSE() -> str: - return "GPL" - - -# Load and attach BPF program -b = BPF() -b.load() -b.attach_all() - -# Get map reference -syscall_count_ref = b["syscall_count"] - - -def get_cgroup_ids(): - """Get all cgroup IDs from the system""" - cgroup_ids = set() - - # Get cgroup IDs from running processes - for proc_dir in Path("/proc").glob("[0-9]*"): - try: - cgroup_file = proc_dir / "cgroup" - if cgroup_file.exists(): - with open(cgroup_file) as f: - for line in f: - # Parse cgroup path and get inode - parts = line.strip().split(":") - if len(parts) >= 3: - cgroup_path = parts[2] - # Try to get the cgroup inode which is used as ID - cgroup_mount = f"/sys/fs/cgroup{cgroup_path}" - if os.path.exists(cgroup_mount): - stat_info = os.stat(cgroup_mount) - cgroup_ids.add(stat_info.st_ino) - except (PermissionError, FileNotFoundError, OSError): - continue - - return cgroup_ids - - -# Display function -def display_stats(): - """Read and display syscall statistics from BPF maps""" - print("\n" + "=" * 60) - print(f"{'CGROUP ID':<20} {'SYSCALL COUNT':<20}") - print("=" * 60) - - # Get cgroup IDs from the system - cgroup_ids = get_cgroup_ids() - - if not cgroup_ids: - print("No cgroups found...") - print("=" * 60) - return - - # Initialize totals - total_syscalls = 0 - - # Track which cgroups have data - cgroups_with_data = [] - - # Display stats for each cgroup - for cgroup_id in sorted(cgroup_ids): - # Get syscall count using lookup - syscall_cnt = 0 - - count = syscall_count_ref.lookup(cgroup_id) - if count is not None: - syscall_cnt = int(count) - total_syscalls += syscall_cnt - - print(f"{cgroup_id:<20} {syscall_cnt:<20}") - cgroups_with_data.append(cgroup_id) - - if not cgroups_with_data: - print("No data collected yet...") - - print("=" * 60) - print(f"{'TOTAL':<20} {total_syscalls:<20}") - print() - - -# Main loop -if __name__ == "__main__": - print("Tracing syscalls per cgroup... Press Ctrl+C to exit\n") - - try: - while True: - time.sleep(5) # Update every 5 seconds - display_stats() - except KeyboardInterrupt: - print("\nStopped") diff --git a/BCC-Examples/container-monitor/tui.py b/BCC-Examples/container-monitor/tui.py new file mode 100644 index 0000000..a01f0ca --- /dev/null +++ b/BCC-Examples/container-monitor/tui.py @@ -0,0 +1,296 @@ +"""Terminal User Interface for container monitoring.""" + +import sys +import time +import curses +from typing import Optional, List +from data_collector import ContainerDataCollector, CgroupInfo, ContainerStats + + +class ContainerMonitorTUI: + """TUI for container monitoring with cgroup selection and live graphs.""" + + def __init__(self, collector: ContainerDataCollector): + self.collector = collector + self.selected_cgroup: Optional[int] = None + self.current_screen = "selection" # "selection" or "monitoring" + self.selected_index = 0 + self.scroll_offset = 0 + + def run(self): + """Run the TUI application.""" + curses.wrapper(self._main_loop) + + def _main_loop(self, stdscr): + """Main curses loop.""" + # Configure curses + curses.curs_set(0) # Hide cursor + stdscr.nodelay(True) # Non-blocking input + stdscr.timeout(100) # Refresh every 100ms + + # Initialize colors + curses.start_color() + curses.init_pair(1, curses.COLOR_CYAN, curses.COLOR_BLACK) + curses.init_pair(2, curses.COLOR_GREEN, curses.COLOR_BLACK) + curses.init_pair(3, curses.COLOR_YELLOW, curses.COLOR_BLACK) + curses.init_pair(4, curses.COLOR_RED, curses.COLOR_BLACK) + curses.init_pair(5, curses.COLOR_MAGENTA, curses.COLOR_BLACK) + curses.init_pair(6, curses.COLOR_WHITE, curses.COLOR_BLUE) + + while True: + stdscr.clear() + + try: + if self.current_screen == "selection": + self._draw_selection_screen(stdscr) + elif self.current_screen == "monitoring": + self._draw_monitoring_screen(stdscr) + + stdscr.refresh() + + # Handle input + key = stdscr.getch() + if key != -1: + if not self._handle_input(key): + break # Exit requested + + except KeyboardInterrupt: + break + except Exception as e: + # Show error + stdscr.addstr(0, 0, f"Error: {str(e)}") + stdscr.refresh() + time.sleep(2) + + def _draw_selection_screen(self, stdscr): + """Draw the cgroup selection screen.""" + height, width = stdscr.getmaxyx() + + # Header + title = "🐳 Container Monitor - Select Cgroup" + stdscr.attron(curses.color_pair(6)) + stdscr.addstr(0, (width - len(title)) // 2, title) + stdscr.attroff(curses.color_pair(6)) + + # Instructions + instructions = "↑↓: Navigate | ENTER: Select | q: Quit | r: Refresh" + stdscr.attron(curses.color_pair(3)) + stdscr.addstr(1, (width - len(instructions)) // 2, instructions) + stdscr.attroff(curses.color_pair(3)) + + # Get cgroups + cgroups = self.collector.get_all_cgroups() + + if not cgroups: + msg = "No cgroups found. Waiting for activity..." + stdscr.attron(curses.color_pair(4)) + stdscr.addstr(height // 2, (width - len(msg)) // 2, msg) + stdscr.attroff(curses.color_pair(4)) + return + + # Sort cgroups by name + cgroups.sort(key=lambda c: c.name) + + # Adjust selection bounds + if self.selected_index >= len(cgroups): + self.selected_index = len(cgroups) - 1 + if self.selected_index < 0: + self.selected_index = 0 + + # Calculate visible range + list_height = height - 6 + if self.selected_index < self.scroll_offset: + self.scroll_offset = self.selected_index + elif self.selected_index >= self.scroll_offset + list_height: + self.scroll_offset = self.selected_index - list_height + 1 + + # Draw cgroup list + start_y = 3 + stdscr.attron(curses.color_pair(1)) + stdscr.addstr(start_y, 2, "─" * (width - 4)) + stdscr.attroff(curses.color_pair(1)) + + for i in range(list_height): + idx = self.scroll_offset + i + if idx >= len(cgroups): + break + + cgroup = cgroups[idx] + y = start_y + 1 + i + + if idx == self.selected_index: + # Highlight selected + stdscr.attron(curses.color_pair(2) | curses.A_BOLD | curses.A_REVERSE) + line = f"► {cgroup.name:<40} ID: {cgroup.id}" + stdscr.addstr(y, 2, line[:width - 4]) + stdscr.attroff(curses.color_pair(2) | curses.A_BOLD | curses.A_REVERSE) + else: + line = f" {cgroup.name:<40} ID: {cgroup.id}" + stdscr.addstr(y, 2, line[:width - 4]) + + # Footer with count + footer = f"Total cgroups: {len(cgroups)}" + stdscr.attron(curses.color_pair(1)) + stdscr.addstr(height - 2, (width - len(footer)) // 2, footer) + stdscr.attroff(curses.color_pair(1)) + + def _draw_monitoring_screen(self, stdscr): + """Draw the monitoring screen for selected cgroup.""" + height, width = stdscr.getmaxyx() + + if self.selected_cgroup is None: + return + + # Get current stats + stats = self.collector.get_stats_for_cgroup(self.selected_cgroup) + history = self.collector.get_history(self.selected_cgroup) + + # Header + title = f"📊 Monitoring: {stats.cgroup_name}" + stdscr.attron(curses.color_pair(6)) + stdscr.addstr(0, (width - len(title)) // 2, title) + stdscr.attroff(curses.color_pair(6)) + + # Instructions + instructions = "ESC/b: Back | q: Quit" + stdscr.attron(curses.color_pair(3)) + stdscr.addstr(1, (width - len(instructions)) // 2, instructions) + stdscr.attroff(curses.color_pair(3)) + + # Syscall count (large number display) + y = 3 + stdscr.attron(curses.color_pair(5) | curses.A_BOLD) + stdscr.addstr(y, 2, "SYSCALLS") + stdscr.attroff(curses.color_pair(5) | curses.A_BOLD) + + syscall_str = f"{stats.syscall_count:,}" + stdscr.attron(curses.color_pair(2) | curses.A_BOLD) + stdscr.addstr(y + 1, 4, syscall_str) + stdscr.attroff(curses.color_pair(2) | curses.A_BOLD) + + # Network I/O graphs + y = 6 + self._draw_section_header(stdscr, y, "NETWORK I/O", 1) + + # RX graph + y += 2 + rx_label = f"RX: {self._format_bytes(stats.rx_bytes)} ({stats.rx_packets:,} packets)" + stdscr.addstr(y, 2, rx_label) + if len(history) > 1: + self._draw_bar_graph(stdscr, y + 1, 2, width - 4, 3, + [s.rx_bytes for s in history], + curses.color_pair(2)) + + # TX graph + y += 5 + tx_label = f"TX: {self._format_bytes(stats.tx_bytes)} ({stats.tx_packets:,} packets)" + stdscr.addstr(y, 2, tx_label) + if len(history) > 1: + self._draw_bar_graph(stdscr, y + 1, 2, width - 4, 3, + [s.tx_bytes for s in history], + curses.color_pair(3)) + + # File I/O graphs + y += 5 + self._draw_section_header(stdscr, y, "FILE I/O", 1) + + # Read graph + y += 2 + read_label = f"READ: {self._format_bytes(stats.read_bytes)} ({stats.read_ops:,} ops)" + stdscr.addstr(y, 2, read_label) + if len(history) > 1: + self._draw_bar_graph(stdscr, y + 1, 2, width - 4, 3, + [s.read_bytes for s in history], + curses.color_pair(4)) + + # Write graph + y += 5 + write_label = f"WRITE: {self._format_bytes(stats.write_bytes)} ({stats.write_ops:,} ops)" + stdscr.addstr(y, 2, write_label) + if len(history) > 1: + self._draw_bar_graph(stdscr, y + 1, 2, width - 4, 3, + [s.write_bytes for s in history], + curses.color_pair(5)) + + def _draw_section_header(self, stdscr, y: int, title: str, color_pair: int): + """Draw a section header.""" + height, width = stdscr.getmaxyx() + stdscr.attron(curses.color_pair(color_pair) | curses.A_BOLD) + stdscr.addstr(y, 2, title) + stdscr.addstr(y, len(title) + 3, "─" * (width - len(title) - 5)) + stdscr.attroff(curses.color_pair(color_pair) | curses.A_BOLD) + + def _draw_bar_graph(self, stdscr, y: int, x: int, width: int, height: int, + data: List[float], color_pair: int): + """Draw a simple bar graph.""" + if not data or width < 2: + return + + # Normalize data to graph width + max_val = max(data) if max(data) > 0 else 1 + + # Take last 'width' data points + recent_data = data[-width:] + + # Draw bars + for row in range(height): + threshold = (height - row) / height + bar_line = "" + + for val in recent_data: + normalized = val / max_val + if normalized >= threshold: + bar_line += "█" + elif normalized >= threshold - 0.2: + bar_line += "▓" + elif normalized >= threshold - 0.4: + bar_line += "▒" + elif normalized >= threshold - 0.6: + bar_line += "░" + else: + bar_line += " " + + try: + stdscr.attron(color_pair) + stdscr.addstr(y + row, x, bar_line[:width]) + stdscr.attroff(color_pair) + except: + pass # Ignore errors at screen edges + + def _format_bytes(self, bytes_val: int) -> str: + """Format bytes into human-readable string.""" + for unit in ['B', 'KB', 'MB', 'GB', 'TB']: + if bytes_val < 1024.0: + return f"{bytes_val:.2f} {unit}" + bytes_val /= 1024. + 0 + return f"{bytes_val:.2f} PB" + + def _handle_input(self, key: int) -> bool: + """Handle keyboard input. Returns False to exit.""" + if key == ord('q') or key == ord('Q'): + return False # Exit + + if self.current_screen == "selection": + if key == curses.KEY_UP: + self.selected_index = max(0, self.selected_index - 1) + elif key == curses.KEY_DOWN: + cgroups = self.collector.get_all_cgroups() + self.selected_index = min(len(cgroups) - 1, self.selected_index + 1) + elif key == ord('\n') or key == curses.KEY_ENTER or key == 10: + # Select cgroup + cgroups = self.collector.get_all_cgroups() + if cgroups and 0 <= self.selected_index < len(cgroups): + cgroups.sort(key=lambda c: c.name) + self.selected_cgroup = cgroups[self.selected_index].id + self.current_screen = "monitoring" + elif key == ord('r') or key == ord('R'): + # Force refresh cache + self.collector._cgroup_cache_time = 0 + + elif self.current_screen == "monitoring": + if key == 27 or key == ord('b') or key == ord('B'): # ESC or 'b' + self.current_screen = "selection" + self.selected_cgroup = None + + return True # Continue running