Skip to content

MLflow Backend

Bases: BackendInterface

MLflow backend integration

Source code in tracelet/backends/mlflow.py
def __init__(self, tracking_uri: Optional[str] = None, experiment_name: Optional[str] = None):
    """Store the MLflow connection settings on the backend instance.

    Args:
        tracking_uri: MLflow tracking URI to use, or ``None`` if not given.
        experiment_name: MLflow experiment name to use, or ``None`` if not given.
    """
    self.tracking_uri = tracking_uri
    self.experiment_name = experiment_name
    # No run handle at construction time; presumably assigned once the
    # backend is started — TODO confirm against the rest of the class.
    self._run = None

options: show_source: true show_bases: true merge_init_into_class: true heading_level: 2

Overview

The MLflow backend integrates Tracelet with an MLflow tracking server for experiment logging and management.

Basic Usage

import tracelet

# Basic MLflow usage
tracelet.start_logging(
    exp_name="mlflow_experiment",
    project="my_project",
    backend="mlflow"
)

# Custom MLflow configuration
tracelet.start_logging(
    exp_name="custom_mlflow",
    project="my_project",
    backend="mlflow",
    config={
        "tracking_uri": "http://mlflow-server:5000",
        "experiment_name": "Deep Learning Experiments"
    }
)

Configuration Options

Via Settings

from tracelet.settings import TraceletSettings

# Configure via settings
settings = TraceletSettings(
    project="mlflow_project",
    backend=["mlflow"],
    mlflow_tracking_uri="http://localhost:5000",
    mlflow_experiment_name="ML Experiments"
)

tracelet.start_logging(exp_name="configured_exp", settings=settings)

Via Environment Variables

# MLflow-specific environment variables
export MLFLOW_TRACKING_URI="http://mlflow-server:5000"
export MLFLOW_EXPERIMENT_NAME="Production Experiments"

# Tracelet MLflow settings
export TRACELET_MLFLOW_TRACKING_URI="http://localhost:5000"
export TRACELET_MLFLOW_EXPERIMENT_NAME="Development Experiments"

Via Configuration Object

# Direct configuration
mlflow_config = {
    "tracking_uri": "http://localhost:5000",
    "experiment_name": "My Experiments",
    "run_name": "baseline_run_001",
    "tags": {
        "team": "ml_research",
        "version": "v1.0",
        "model_type": "transformer"
    }
}

tracelet.start_logging(
    exp_name="tagged_experiment",
    project="research",
    backend="mlflow",
    config=mlflow_config
)

MLflow Server Setup

Local MLflow Server

# Start local MLflow server
mlflow server --host 0.0.0.0 --port 5000

# With backend store (SQLite)
mlflow server \
    --backend-store-uri sqlite:///mlflow.db \
    --default-artifact-root ./mlflow-artifacts \
    --host 0.0.0.0 \
    --port 5000

Remote MLflow Server

# Connect to remote MLflow server
tracelet.start_logging(
    exp_name="remote_experiment",
    project="distributed_training",
    backend="mlflow",
    config={
        "tracking_uri": "http://mlflow.company.com:5000",
        "experiment_name": "Production Models"
    }
)

Docker MLflow Setup

# docker-compose.yml
version: "3.8"
services:
  mlflow:
    # Official MLflow image, published on GitHub Container Registry
    image: ghcr.io/mlflow/mlflow
    ports:
      - "5000:5000"
    # Both the SQLite database and the artifact root are placed under /mlflow
    # so they live in the named volume and survive container re-creation.
    # (Four slashes = absolute path in a SQLAlchemy SQLite URI.)
    command: >
      mlflow server
      --backend-store-uri sqlite:////mlflow/mlflow.db
      --default-artifact-root /mlflow/artifacts
      --host 0.0.0.0
      --port 5000
    volumes:
      - mlflow_data:/mlflow

volumes:
  mlflow_data:

Advanced Usage

Experiment Organization

# Organize experiments by project/team
tracelet.start_logging(
    exp_name="resnet_baseline",
    project="computer_vision",
    backend="mlflow",
    config={
        "experiment_name": "CV Team - Image Classification",
        "tags": {
            "project": "computer_vision",
            "team": "cv_team",
            "model_family": "resnet",
            "dataset": "imagenet"
        }
    }
)

Artifact Management

import tracelet
import torch

# Start experiment
exp = tracelet.start_logging(
    exp_name="artifact_demo",
    project="model_artifacts",
    backend="mlflow"
)

# Log model artifacts
torch.save(model.state_dict(), "model.pth")
exp.log_artifact("model.pth", "models/trained_model.pth")

# Log configuration files
exp.log_artifact("config.yaml", "configs/training_config.yaml")

# Log processed datasets
exp.log_artifact("train_data.csv", "datasets/processed_train.csv")

Model Registry Integration

import mlflow
import tracelet

# Register model after training
exp = tracelet.start_logging(
    exp_name="model_registry_demo",
    project="production_models",
    backend="mlflow"
)

# Train model and log metrics
# ... training code ...

# Register model in MLflow Model Registry
mlflow.pytorch.log_model(
    pytorch_model=model,
    artifact_path="model",
    registered_model_name="ProductionImageClassifier"
)

Auto-logging Integration

import mlflow
import mlflow.pytorch
import torch
import tracelet

# Enable MLflow auto-logging
# NOTE: mlflow.pytorch.autolog() instruments PyTorch Lightning's Trainer.fit();
# a hand-written training loop is presumably only captured by Tracelet —
# verify against the MLflow version you use.
mlflow.pytorch.autolog()

# Start Tracelet (will work alongside auto-logging)
tracelet.start_logging(
    exp_name="autolog_experiment",
    project="auto_tracking",
    backend="mlflow"
)

# Training code - metrics logged automatically by both systems
model = torch.nn.Linear(10, 1)
optimizer = torch.optim.Adam(model.parameters())
# ... training loop ...

Error Handling

Connection Issues

import tracelet

try:
    tracelet.start_logging(
        exp_name="connection_test",
        project="reliability",
        backend="mlflow",
        config={"tracking_uri": "http://unreachable-server:5000"}
    )
except Exception as e:
    print(f"MLflow connection failed: {e}")

    # Fallback to local tracking
    tracelet.start_logging(
        exp_name="connection_test_local",
        project="reliability",
        backend="mlflow"  # Uses default local tracking
    )

Experiment Creation

# Handle experiment name conflicts
import mlflow
import tracelet

experiment_name = "Shared Experiment"

try:
    tracelet.start_logging(
        exp_name="shared_run",
        project="collaborative",
        backend="mlflow",
        config={"experiment_name": experiment_name}
    )
except mlflow.exceptions.MlflowException as e:
    if "already exists" not in str(e):
        # Unrelated MLflow failure: bare `raise` re-raises the active
        # exception with its original traceback untouched.
        raise
    print(f"Experiment '{experiment_name}' already exists, joining it")
    # MLflow will automatically use existing experiment

Integration Examples

PyTorch Training Loop

import torch
import torch.nn as nn
import tracelet

# Single source of truth: the values that are logged are the values that are used
hparams = {
    "learning_rate": 0.001,
    "batch_size": 32,
    "epochs": 100,
    "model": "linear",
}

# Start experiment
exp = tracelet.start_logging(
    exp_name="pytorch_training",
    project="deep_learning",
    backend="mlflow"
)

try:
    # Log hyperparameters
    exp.log_params(hparams)

    # Training loop
    # `dataloader` is your own torch.utils.data.DataLoader yielding
    # (data, target) batches built with hparams["batch_size"].
    model = nn.Linear(784, 10)
    optimizer = torch.optim.Adam(model.parameters(), lr=hparams["learning_rate"])
    criterion = nn.CrossEntropyLoss()

    for epoch in range(hparams["epochs"]):
        total_loss = 0
        for batch_idx, (data, target) in enumerate(dataloader):
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Log epoch metrics
        avg_loss = total_loss / len(dataloader)
        exp.log_metric("train_loss", avg_loss, iteration=epoch)

        # Save checkpoint every 10 epochs
        if epoch % 10 == 0:
            checkpoint_file = f"checkpoint_epoch_{epoch}.pth"
            torch.save(model.state_dict(), checkpoint_file)
            exp.log_artifact(
                checkpoint_file,
                f"checkpoints/epoch_{epoch}.pth"
            )
finally:
    # Close the run even if training raises, so it is not left open
    tracelet.stop_logging()

Hyperparameter Optimization

import optuna
import tracelet

def objective(trial):
    """Optuna objective that tracks one trial as one Tracelet/MLflow experiment.

    Args:
        trial: The Optuna trial used to sample the learning rate and batch size.

    Returns:
        The accuracy returned by ``train_model`` for the sampled hyperparameters.
    """
    # Suggest hyperparameters
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64])

    # Start experiment for this trial
    exp = tracelet.start_logging(
        exp_name=f"optuna_trial_{trial.number}",
        project="hyperparameter_optimization",
        backend="mlflow",
        config={
            "tags": {
                "optimization": "optuna",
                "trial_number": str(trial.number)
            }
        }
    )

    try:
        # Log trial parameters
        exp.log_params({
            "lr": lr,
            "batch_size": batch_size,
            "trial_number": trial.number
        })

        # Train model with suggested parameters
        accuracy = train_model(lr=lr, batch_size=batch_size, experiment=exp)

        # Log final result
        exp.log_metric("final_accuracy", accuracy)
    finally:
        # Always close this trial's run, even when training fails, so the
        # next trial does not start while a previous run is still open.
        tracelet.stop_logging()

    return accuracy

# Run optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

Best Practices

Experiment Naming

# Use descriptive experiment names
from datetime import datetime

# Take the timestamp once so the run name and the "date" tag always agree
started_at = datetime.now()

tracelet.start_logging(
    exp_name=f"resnet50_imagenet_{started_at.strftime('%Y%m%d_%H%M')}",
    project="computer_vision",
    backend="mlflow",
    config={
        "experiment_name": "CV Team - ImageNet Classification",
        "tags": {
            "architecture": "resnet50",
            "dataset": "imagenet",
            "date": started_at.isoformat()
        }
    }
)

Metric Organization

# Use hierarchical metric names
exp.log_metric("train/loss", train_loss, iteration=epoch)
exp.log_metric("train/accuracy", train_acc, iteration=epoch)
exp.log_metric("val/loss", val_loss, iteration=epoch)
exp.log_metric("val/accuracy", val_acc, iteration=epoch)
exp.log_metric("lr_schedule/learning_rate", current_lr, iteration=epoch)

Resource Management

# Ensure proper cleanup
import atexit
import tracelet

exp = tracelet.start_logging(
    exp_name="robust_experiment",
    project="production",
    backend="mlflow"
)

# Register cleanup function
atexit.register(tracelet.stop_logging)

try:
    # Training code here
    pass
finally:
    # Always stop logging
    tracelet.stop_logging()