Skip to content

System Metrics Collector

Bases: CollectorInterface

Collector for system metrics including CPU, memory, and GPU usage

Source code in tracelet/collectors/system.py
def __init__(self, collect_interval: float = 10.0):
    """Set up the collector's initial state without starting any thread.

    Args:
        collect_interval: Stored on ``self.collect_interval``; presumably the
            number of seconds between background collections (confirm
            against ``_collection_loop``).
    """
    # NVML is flagged as not initialized until something else sets it up.
    self._nvml_initialized = False
    self._metrics = {}
    # Handle of the background worker; ``None`` means no thread exists yet.
    self._collection_thread = None
    self._stop_event = threading.Event()
    self.collect_interval = collect_interval

collect()

Collect current system information and metrics

Source code in tracelet/collectors/system.py
def collect(self) -> dict[str, Any]:
    """Return static host facts merged with a fresh metrics reading.

    Returns:
        A dict with the platform string, Python version, logical and
        physical CPU counts, total memory and hostname, updated with
        whatever ``self._collect_current_metrics()`` returns (its entries
        win when a key collides).
    """
    host_facts: dict[str, Any] = {}
    host_facts["platform"] = platform.platform()
    host_facts["python_version"] = platform.python_version()
    host_facts["cpu_count"] = psutil.cpu_count()
    host_facts["cpu_count_physical"] = psutil.cpu_count(logical=False)
    host_facts["memory_total"] = psutil.virtual_memory().total
    host_facts["hostname"] = platform.node()

    # Layer the current readings on top of the static description.
    host_facts.update(self._collect_current_metrics())
    return host_facts

start()

Start the background collection thread

Source code in tracelet/collectors/system.py
def start(self):
    """Launch the daemon thread running ``_collection_loop``; no-op if one exists."""
    if self._collection_thread is not None:
        # A worker handle is already present, so do not spawn a second one.
        return

    # Make sure a previous stop request does not end the new loop at once.
    self._stop_event.clear()
    worker = threading.Thread(target=self._collection_loop, daemon=True)
    self._collection_thread = worker
    worker.start()

stop()

Stop the background collection thread

Source code in tracelet/collectors/system.py
def stop(self):
    """Stop the background collection thread and release NVML.

    Signals the stop event, waits for the worker thread to finish, and
    clears the thread handle so ``start()`` can create a new one. If NVML
    was flagged as initialized, it is shut down on a best-effort basis and
    the flag is reset, so the object no longer claims NVML is usable and a
    repeated ``stop()`` does not attempt a second shutdown.
    """
    if self._collection_thread is not None:
        self._stop_event.set()
        self._collection_thread.join()
        self._collection_thread = None

    if self._nvml_initialized:
        # Shutdown errors are deliberately ignored: there is nothing useful
        # to do about them while tearing down.
        with contextlib.suppress(Exception):
            pynvml.nvmlShutdown()
        # Reset even when shutdown failed so later code does not keep
        # treating NVML as ready.
        self._nvml_initialized = False

options: show_source: true show_bases: true merge_init_into_class: true heading_level: 2

Overview

The System Metrics Collector monitors system performance during experiment execution, providing insights into resource utilization and system health.

Basic Usage

import tracelet

# Starting an experiment is all that is required here: with the default
# settings, system metrics are collected automatically in the background.
exp = tracelet.start_logging(
    backend="mlflow",
    project="performance_tracking",
    exp_name="system_monitoring_demo",
)

# ... train your model here while collection runs alongside it ...

tracelet.stop_logging()

Manual System Collection

from tracelet.collectors.system import SystemMetricsCollector

# Build a collector with collect_interval=5.0, initialize it, and launch its
# background collection thread.
system_collector = SystemMetricsCollector(collect_interval=5.0)
system_collector.initialize()
system_collector.start()

# Take a one-off snapshot of the current metrics and print every entry.
current_metrics = system_collector.collect()
print("Current System Metrics:")
for name in current_metrics:
    print(f"  {name}: {current_metrics[name]}")

# Shut the background collection down again.
system_collector.stop()

Configuration Options

Collection Interval

import tracelet
from tracelet.settings import TraceletSettings

# Configure system metrics collection
settings = TraceletSettings(
    project="performance_monitoring",
    backend=["mlflow"],
    track_system=True,        # Enable system tracking
    metrics_interval=10.0,    # Collect every 10 seconds
    track_gpu=True,           # Include GPU metrics (if available)
    track_disk=True,          # Include disk I/O metrics
    track_network=True,       # Include network I/O metrics
)

# Start the experiment with the settings built above. The `tracelet` import
# at the top is required for this call.
tracelet.start_logging(
    exp_name="configured_monitoring",
    settings=settings,
)

Custom Collection Parameters

from tracelet.collectors.system import SystemMetricsCollector

# Custom collector configuration.
# The constructor signature is `__init__(self, collect_interval: float = 10.0)`,
# so `collect_interval` is the only option it accepts. Passing other keyword
# arguments such as `include_per_cpu`, `include_processes` or `process_limit`
# raises TypeError.
collector = SystemMetricsCollector(
    collect_interval=2.0,     # Collect every 2 seconds
)

Collected Metrics

CPU Metrics

  • cpu_percent: Overall CPU utilization percentage
  • cpu_count: Number of logical CPU cores
  • cpu_count_physical: Number of physical CPU cores
  • cpu_per_core: Per-core utilization (if enabled)
  • load_average: System load averages (1, 5, 15 minutes)

Memory Metrics

  • memory_total: Total system memory (bytes)
  • memory_available: Available memory (bytes)
  • memory_used: Used memory (bytes)
  • memory_percent: Memory utilization percentage
  • swap_total: Total swap space (bytes)
  • swap_used: Used swap space (bytes)

Disk Metrics

  • disk_total: Total disk space (bytes)
  • disk_used: Used disk space (bytes)
  • disk_free: Free disk space (bytes)
  • disk_percent: Disk utilization percentage
  • disk_read_bytes: Cumulative bytes read
  • disk_write_bytes: Cumulative bytes written

Network Metrics

  • network_bytes_sent: Cumulative bytes sent
  • network_bytes_recv: Cumulative bytes received
  • network_packets_sent: Cumulative packets sent
  • network_packets_recv: Cumulative packets received

GPU Metrics (if available)

  • gpu_count: Number of available GPUs
  • gpu_utilization: GPU utilization per device
  • gpu_memory_used: GPU memory usage per device
  • gpu_memory_total: Total GPU memory per device
  • gpu_temperature: GPU temperature per device

Practical Examples

Performance Monitoring During Training

import time
import tracelet
from tracelet.collectors.system import SystemMetricsCollector

# Open the experiment; tracelet handles its own background system monitoring.
exp = tracelet.start_logging(
    backend="mlflow",
    project="ml_performance",
    exp_name="performance_monitored_training",
)

# A separate collector is used only for on-demand snapshots below.
collector = SystemMetricsCollector(collect_interval=1.0)
collector.initialize()

for epoch in range(100):
    time.sleep(0.1)  # stand-in for real training work

    # Training metric, logged on every epoch.
    exp.log_metric("train_loss", 1.0 / (epoch + 1), iteration=epoch)

    # System snapshots are taken only on every tenth epoch.
    if epoch % 10 != 0:
        continue

    snapshot = collector.collect()

    # Key host metrics; 0 is logged when a key is absent from the snapshot.
    for key in ("cpu_percent", "memory_percent"):
        exp.log_metric(key, snapshot.get(key, 0), iteration=epoch)

    # Per-GPU metrics, present only when the snapshot has a "gpu" entry.
    if "gpu" in snapshot:
        for i, gpu_info in enumerate(snapshot["gpu"]):
            utilization = gpu_info.get("utilization", 0)
            memory_percent = gpu_info.get("memory_percent", 0)
            exp.log_metric(f"gpu_{i}_utilization", utilization, iteration=epoch)
            exp.log_metric(f"gpu_{i}_memory_percent", memory_percent, iteration=epoch)

tracelet.stop_logging()

Resource Usage Analysis

import time  # needed for time.time() / time.sleep() below

import tracelet
import matplotlib.pyplot as plt
from tracelet.collectors.system import SystemMetricsCollector

# Detailed resource analysis
collector = SystemMetricsCollector(collect_interval=0.5)
collector.initialize()

# Start experiment
exp = tracelet.start_logging(
    exp_name="resource_analysis",
    project="performance_analysis",
    backend="mlflow"
)

# Collect metrics during workload
metrics_history = []
start_time = time.time()

# Simulate varying workload: 120 samples in three phases of 40 iterations.
# The sleeps alone add up to roughly 24 seconds (40*0.1 + 40*0.3 + 40*0.2).
for i in range(120):
    # Simulate different workload intensities
    if i < 40:
        # Light workload
        time.sleep(0.1)
    elif i < 80:
        # Heavy workload simulation
        _ = [x**2 for x in range(10000)]
        time.sleep(0.3)
    else:
        # Cool down
        time.sleep(0.2)

    # Collect metrics and tag them with seconds elapsed since the start
    metrics = collector.collect()
    metrics["timestamp"] = time.time() - start_time
    metrics_history.append(metrics)

# Analyze and plot resource usage (missing keys are plotted as 0)
timestamps = [m["timestamp"] for m in metrics_history]
cpu_usage = [m.get("cpu_percent", 0) for m in metrics_history]
memory_usage = [m.get("memory_percent", 0) for m in metrics_history]

plt.figure(figsize=(12, 8))

plt.subplot(2, 1, 1)
plt.plot(timestamps, cpu_usage, label="CPU %")
plt.ylabel("CPU Usage (%)")
plt.legend()
plt.title("System Resource Usage During Experiment")

plt.subplot(2, 1, 2)
plt.plot(timestamps, memory_usage, label="Memory %", color="red")
plt.ylabel("Memory Usage (%)")
plt.xlabel("Time (seconds)")
plt.legend()

plt.tight_layout()
plt.savefig("resource_usage.png")

# Log analysis results
exp.log_artifact("resource_usage.png", "analysis/resource_usage.png")

# Log summary statistics
avg_cpu = sum(cpu_usage) / len(cpu_usage)
max_cpu = max(cpu_usage)
avg_memory = sum(memory_usage) / len(memory_usage)
max_memory = max(memory_usage)

exp.log_params({
    "avg_cpu_usage": avg_cpu,
    "max_cpu_usage": max_cpu,
    "avg_memory_usage": avg_memory,
    "max_memory_usage": max_memory
})

tracelet.stop_logging()

GPU Monitoring

import tracelet
from tracelet.collectors.system import SystemMetricsCollector

def monitor_gpu_training():
    """Train a small linear model on CUDA and log per-GPU readings each epoch.

    When CUDA is not available, a message is printed and logging is stopped
    without running any training.
    """
    exp = tracelet.start_logging(
        backend="mlflow",
        project="gpu_performance",
        exp_name="gpu_monitored_training",
    )

    collector = SystemMetricsCollector(collect_interval=1.0)
    collector.initialize()

    # torch is imported lazily, after logging and the collector are set up.
    import torch

    if not torch.cuda.is_available():
        print("No GPU available for monitoring")
        tracelet.stop_logging()
        return

    device = torch.device("cuda")

    # Model and optimizer live on the GPU.
    model = torch.nn.Linear(1000, 100).to(device)
    optimizer = torch.optim.Adam(model.parameters())

    for epoch in range(50):
        # One synthetic training batch per epoch.
        data = torch.randn(128, 1000).to(device)
        target = torch.randn(128, 100).to(device)

        optimizer.zero_grad()
        loss = torch.nn.functional.mse_loss(model(data), target)
        loss.backward()
        optimizer.step()

        exp.log_metric("train_loss", loss.item(), iteration=epoch)

        # GPU readings are logged only when the snapshot has a "gpu" entry.
        system_metrics = collector.collect()
        if "gpu" not in system_metrics:
            continue
        for i, gpu_info in enumerate(system_metrics["gpu"]):
            exp.log_metric(f"gpu_{i}_utilization", gpu_info.get("utilization", 0), iteration=epoch)
            exp.log_metric(f"gpu_{i}_memory_used", gpu_info.get("memory_used", 0), iteration=epoch)
            exp.log_metric(f"gpu_{i}_temperature", gpu_info.get("temperature", 0), iteration=epoch)

    tracelet.stop_logging()

# Run GPU monitoring
monitor_gpu_training()

Advanced Features

Custom Metric Collection

from tracelet.collectors.system import SystemMetricsCollector
import psutil
import json
import time  # required by the uptime calculation in _collect_custom_metrics

class ExtendedSystemCollector(SystemMetricsCollector):
    """Extended system collector with custom metrics."""

    def collect(self):
        """Collect the standard metrics and merge the custom ones on top.

        Returns:
            The dict from ``SystemMetricsCollector.collect()`` updated with
            the entries produced by ``_collect_custom_metrics()``.
        """
        # Get base metrics
        metrics = super().collect()

        # Add custom metrics
        custom_metrics = self._collect_custom_metrics()
        metrics.update(custom_metrics)

        return metrics

    def _collect_custom_metrics(self):
        """Collect additional custom system metrics.

        Returns:
            A dict with ``process_counts`` (status -> count),
            ``total_processes``, ``open_files`` (-1 when it cannot be read)
            and ``system_uptime_hours``.
        """
        custom = {}

        # Process count by state
        processes = list(psutil.process_iter(['status']))
        status_counts = {}
        for proc in processes:
            try:
                status = proc.info['status']
                status_counts[status] = status_counts.get(status, 0) + 1
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                pass

        custom["process_counts"] = status_counts
        custom["total_processes"] = len(processes)

        # Number of entries returned by open_files() for the current process;
        # -1 marks "could not be determined".
        try:
            current_process = psutil.Process()
            custom["open_files"] = len(current_process.open_files())
        except (psutil.AccessDenied, AttributeError):
            custom["open_files"] = -1

        # System uptime: seconds since boot converted to hours
        boot_time = psutil.boot_time()
        uptime = time.time() - boot_time
        custom["system_uptime_hours"] = uptime / 3600

        return custom

# Usage
extended_collector = ExtendedSystemCollector(collect_interval=5.0)
extended_collector.initialize()

metrics = extended_collector.collect()
print("Extended metrics:", json.dumps(metrics, indent=2))

Threshold-Based Alerts

import tracelet
from tracelet.collectors.system import SystemMetricsCollector
import time

class AlertingSystemCollector(SystemMetricsCollector):
    """System collector that records an alert whenever a metric exceeds its threshold."""

    def __init__(self, collect_interval=10.0, thresholds=None):
        """Store the thresholds (or the built-in defaults) and start with no alerts.

        Args:
            collect_interval: Forwarded to ``SystemMetricsCollector.__init__``.
            thresholds: Mapping of metric name to limit. A falsy value
                (``None`` or an empty mapping) selects the defaults.
        """
        super().__init__(collect_interval)
        default_thresholds = {
            "cpu_percent": 90.0,
            "memory_percent": 85.0,
            "disk_percent": 95.0
        }
        self.thresholds = thresholds or default_thresholds
        self.alerts = []

    def collect(self):
        """Return the base metrics after checking them against the thresholds."""
        # NOTE(review): alerts are only recorded when this method runs;
        # confirm that the background loop goes through collect().
        snapshot = super().collect()
        self._check_thresholds(snapshot)
        return snapshot

    def _check_thresholds(self, metrics):
        """Append an alert for every configured metric above its threshold."""
        now = time.time()

        for metric_name, limit in self.thresholds.items():
            if metric_name not in metrics:
                continue

            reading = metrics[metric_name]
            if not reading > limit:
                continue

            # Readings below 110% of the limit are warnings, the rest critical.
            if reading < limit * 1.1:
                severity = "warning"
            else:
                severity = "critical"

            self.alerts.append({
                "timestamp": now,
                "metric": metric_name,
                "value": reading,
                "threshold": limit,
                "severity": severity
            })
            print(f"ALERT: {metric_name} = {reading:.1f}% (threshold: {limit}%)")

    def get_alerts(self):
        """Return a shallow copy of all alerts recorded so far."""
        return list(self.alerts)

# Usage with alerting
exp = tracelet.start_logging(
    backend="mlflow",
    project="system_alerting",
    exp_name="monitored_with_alerts",
)

alerting_collector = AlertingSystemCollector(
    collect_interval=2.0,
    thresholds={
        "cpu_percent": 80.0,
        "memory_percent": 75.0
    }
)

alerting_collector.initialize()
alerting_collector.start()

# Stand-in for the real workload
time.sleep(30)

# Summarize whatever alerts were raised and attach them to the experiment
alerts = alerting_collector.get_alerts()
if alerts:
    print(f"Generated {len(alerts)} alerts during experiment")

    cpu_alert_values = (a["value"] for a in alerts if a["metric"] == "cpu_percent")
    critical_total = sum(1 for a in alerts if a["severity"] == "critical")
    exp.log_params({
        "alert_count": len(alerts),
        "max_cpu_alert": max(cpu_alert_values, default=0),
        "critical_alerts": critical_total
    })

alerting_collector.stop()
tracelet.stop_logging()

Error Handling

Platform Compatibility

import platform
from tracelet.collectors.system import SystemMetricsCollector

# Collection interval keyed by the lower-cased platform.system() value.
_PLATFORM_INTERVALS = {
    "linux": 5.0,
    "darwin": 5.0,    # macOS
    "windows": 10.0,  # Slower collection on Windows
}
# Used for any platform not listed above.
_DEFAULT_INTERVAL = 10.0

def create_platform_aware_collector():
    """Create and initialize a system collector tuned for the current platform.

    Only ``collect_interval`` is varied per platform: it is the sole option
    the ``SystemMetricsCollector`` constructor accepts, so any other keyword
    argument would raise TypeError and make this function return ``None``.

    Returns:
        The initialized collector, or ``None`` when creating or
        initializing it fails (the error is printed).
    """
    system = platform.system().lower()
    interval = _PLATFORM_INTERVALS.get(system, _DEFAULT_INTERVAL)

    try:
        collector = SystemMetricsCollector(collect_interval=interval)
        collector.initialize()
    except Exception as e:
        # Example-level boundary: report the failure and let the caller
        # fall back instead of crashing.
        print(f"Failed to create system collector: {e}")
        return None

    return collector

# Usage
collector = create_platform_aware_collector()
if collector:
    metrics = collector.collect()
    print(f"Platform: {platform.system()}")
    print(f"Available metrics: {list(metrics.keys())}")
else:
    print("System metrics collection not available on this platform")

Graceful Degradation

import logging
import platform
import time

import tracelet

def safe_system_collection():
    """Safely collect system metrics with fallback options.

    Returns:
        The dict returned by ``SystemMetricsCollector.collect()`` on success.
        On failure, a dict with an ``"error"`` entry plus minimal fallback
        information, so callers can branch on ``"error" in result``.
    """
    try:
        # Imported inside the try block so that a missing dependency is
        # handled by the ImportError branch below instead of failing when
        # this module is imported.
        from tracelet.collectors.system import SystemMetricsCollector

        # Try full system collection
        collector = SystemMetricsCollector(collect_interval=5.0)
        collector.initialize()
        metrics = collector.collect()

    except ImportError as e:
        logging.error("Missing system monitoring dependencies: %s", e)
        return {"error": "psutil not available", "basic_info": {"platform": platform.system()}}

    except Exception as e:
        logging.error("System collection failed: %s", e)
        # Fallback to basic information
        return {
            "error": str(e),
            "fallback_metrics": {
                "timestamp": time.time(),
                "platform": platform.system(),
                "python_version": platform.python_version()
            }
        }

    # Validate critical metrics; a gap is reported but not treated as fatal.
    required_metrics = ["cpu_percent", "memory_percent"]
    missing_metrics = [m for m in required_metrics if m not in metrics]

    if missing_metrics:
        logging.warning("Missing critical metrics: %s", missing_metrics)

    return metrics

# Usage
system_info = safe_system_collection()

exp = tracelet.start_logging(
    exp_name="safe_monitoring",
    project="robust_experiments",
    backend="mlflow"
)

if "error" not in system_info:
    # Log successful metrics. collect() reports the logical core count under
    # the "cpu_count" key.
    exp.log_params({
        "system_monitoring": "enabled",
        "cpu_cores": system_info.get("cpu_count", "unknown"),
        "memory_gb": round(system_info.get("memory_total", 0) / (1024**3), 2)
    })
else:
    # Log error information
    exp.log_params({
        "system_monitoring": "failed",
        "error": system_info["error"]
    })

tracelet.stop_logging()

Best Practices

Resource-Aware Collection

from tracelet.collectors.system import SystemMetricsCollector
import psutil

def configure_adaptive_collection():
    """Build a collector whose interval is adapted to total system memory.

    The interval is the only setting adapted here because
    ``collect_interval`` is the only option the ``SystemMetricsCollector``
    constructor accepts; any other keyword argument raises TypeError.

    Returns:
        A ``SystemMetricsCollector`` that has not been initialized yet.
    """
    # Total physical memory in GiB
    memory_gb = psutil.virtual_memory().total / (1024**3)

    # Collect less often on machines with less memory
    if memory_gb < 4:  # Low memory system
        interval = 30.0
    elif memory_gb < 8:  # Medium memory system
        interval = 15.0
    else:  # High memory system
        interval = 5.0

    collector = SystemMetricsCollector(collect_interval=interval)

    print("Configured system collection:")
    print(f"  Total memory: {memory_gb:.1f} GB")
    print(f"  Interval: {interval}s")

    return collector

# Usage
adaptive_collector = configure_adaptive_collection()
adaptive_collector.initialize()

Experiment Performance Impact

import time
from tracelet.collectors.system import SystemMetricsCollector

def _run_workload():
    """Run the fixed CPU-bound workload once and return its duration in seconds."""
    # perf_counter() is the clock intended for measuring short durations.
    started = time.perf_counter()
    for _ in range(1000):
        _ = [x**2 for x in range(1000)]  # Simulate work
    return time.perf_counter() - started

def measure_collection_overhead():
    """Measure the performance impact of system collection.

    Times the same workload twice, first without a collector and then with
    a collector running at a 0.1 interval, and prints both timings.

    Returns:
        The slowdown of the second run relative to the baseline, in percent.
    """
    # Baseline measurement without collection
    baseline_time = _run_workload()

    # Measurement with system collection
    collector = SystemMetricsCollector(collect_interval=0.1)  # Aggressive collection
    collector.initialize()
    collector.start()
    try:
        collection_time = _run_workload()
    finally:
        # Always stop the background thread, even if the workload raises.
        collector.stop()

    overhead_percent = ((collection_time - baseline_time) / baseline_time) * 100

    print(f"Baseline time: {baseline_time:.3f}s")
    print(f"With collection: {collection_time:.3f}s")
    print(f"Overhead: {overhead_percent:.1f}%")

    return overhead_percent

# Measure and decide on collection strategy
overhead = measure_collection_overhead()

if overhead > 5.0:  # More than 5% overhead
    print("High overhead detected, using conservative collection")
    collection_interval = 30.0
else:
    print("Low overhead, using frequent collection")
    collection_interval = 5.0