Computer Vision with Modalkit¶

Deploy a computer vision model for image classification and object detection using Modalkit, demonstrating GPU acceleration, cloud storage, and production deployment patterns.

Overview¶

This tutorial covers:

- Image classification with pre-trained models
- Object detection with YOLO/DETR
- Image preprocessing and augmentation
- Cloud storage for model artifacts
- Batch processing of images
- Production deployment with monitoring

Project Structure¶

cv-service/
├── app.py                 # Modal app definition
├── vision_model.py        # Computer vision inference
├── utils.py               # Image processing utilities
├── modalkit.yaml          # Configuration
├── requirements.txt       # Dependencies
└── models/                # Model artifacts
    ├── classifier.pth
    └── yolov8n.pt

1. Vision Model Implementation¶

Create vision_model.py:

from modalkit.inference_pipeline import InferencePipeline
from pydantic import BaseModel
from typing import List, Optional, Dict, Any
import torch
import torchvision.transforms as transforms
from torchvision.models import resnet50, ResNet50_Weights
from PIL import Image
import numpy as np
import cv2
import io
import base64
import json
import os

# Input/Output schemas
class ImageInput(BaseModel):
    """Request payload describing one image and how it should be processed.

    NOTE(review): ``task`` is a free-form ``str``; the schema itself does not
    restrict it to the two documented values.
    """

    # Image bytes as base64 text; decoded with base64.b64decode and opened via PIL.
    image: str  # Base64 encoded image
    # Which pipeline to run for this image.
    task: str = "classification"  # "classification" or "detection"
    # Predictions whose confidence is below this value are dropped in postprocess.
    confidence_threshold: float = 0.5
    # Upper bound on detections returned; applied after threshold filtering.
    max_detections: int = 10

class BoundingBox(BaseModel):
    """Corner-format bounding box together with its class and confidence.

    NOTE(review): nothing in the code shown in this tutorial constructs this
    model; detections are emitted as plain dicts with a ``"bbox"`` list.
    """

    # Box corners; presumably (x1, y1) is top-left and (x2, y2) bottom-right
    # in pixel coordinates -- TODO confirm.
    x1: float
    y1: float
    x2: float
    y2: float
    # Detection score for this box.
    confidence: float
    # Numeric class index and its human-readable name.
    class_id: int
    class_name: str

class ImageOutput(BaseModel):
    """Response for one input image, as assembled by ``postprocess``."""

    # Echo of the task requested for this image.
    task: str
    # One dict per prediction. Classification entries carry class_id,
    # class_name and confidence; detection entries additionally carry "bbox".
    predictions: List[Dict[str, Any]]
    # Filled from the "inference_time" value reported by predict (seconds).
    processing_time: float
    # Size of the input image.
    image_size: tuple
    # Taken from the model's settings entry, defaulting to "vision_model".
    model_name: str

class ComputerVisionInference(InferencePipeline):
    def __init__(self, model_name: str, all_model_data_folder: str, common_settings: dict, *args, **kwargs):
        """Set up the device, both models, the preprocessing transform and labels.

        Args:
            model_name: Key of this model; also used to look up its settings
                inside ``common_settings``.
            all_model_data_folder: Forwarded unchanged to the base pipeline.
            common_settings: Settings mapping; the entry under ``model_name``
                (or an empty dict) becomes ``self.model_config``.
            *args, **kwargs: Accepted for signature compatibility and ignored.
        """
        super().__init__(model_name, all_model_data_folder, common_settings)

        # Settings specific to this model live under its own key.
        self.model_config = common_settings.get(model_name, {})

        # Use the GPU whenever torch can see one; the loaders below rely on it.
        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device_name)

        # Classifier is mandatory; the detector may come back as None.
        self.classification_model = self._load_classification_model()
        self.detection_model = self._load_detection_model()

        # Classification preprocessing: fixed 224x224 input, ImageNet statistics.
        resize_step = transforms.Resize((224, 224))
        tensor_step = transforms.ToTensor()
        normalize_step = transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        )
        self.transform = transforms.Compose([resize_step, tensor_step, normalize_step])

        # Mapping from class index to display name.
        self.class_labels = self._load_class_labels()

        print(f"Computer Vision model loaded on {self.device}")

    def _load_classification_model(self):
        """Return a ResNet50 classifier on ``self.device`` in eval mode.

        Weights come from ``/mnt/models/classifier.pth`` when that file exists;
        otherwise the torchvision ImageNet (IMAGENET1K_V2) weights are used.
        """
        checkpoint_path = "/mnt/models/classifier.pth"

        if not os.path.exists(checkpoint_path):
            print("Loading pre-trained ResNet50 model")
            model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)
        else:
            print("Loading classification model from mounted storage")
            model = resnet50(weights=None)
            # NOTE(review): torch.load unpickles the file, so the checkpoint
            # must come from a trusted location.
            state_dict = torch.load(checkpoint_path, map_location=self.device)
            model.load_state_dict(state_dict)

        # Both calls return the module itself, so they can be chained.
        return model.to(self.device).eval()

    def _load_detection_model(self):
        """Return a YOLOv8 detector, or ``None`` when ultralytics is not importable.

        A weights file at ``/mnt/models/yolov8n.pt`` is preferred; if it is
        absent the stock ``yolov8n.pt`` name is handed to ultralytics instead.
        """
        try:
            import ultralytics

            mounted_weights = "/mnt/models/yolov8n.pt"
            if os.path.exists(mounted_weights):
                print("Loading YOLO model from mounted storage")
                weights_source = mounted_weights
            else:
                print("Loading pre-trained YOLOv8 model")
                weights_source = 'yolov8n.pt'

            return ultralytics.YOLO(weights_source)
        except ImportError:
            # Detection is an optional feature; callers check for None.
            print("YOLO not available, detection disabled")
            return None

    def _load_class_labels(self):
        """Return a ``{class_index: class_name}`` mapping for the classifier.

        Labels are read from ``/mnt/models/class_labels.json`` when that file
        exists, otherwise the built-in ImageNet labels are returned.

        JSON object keys are always strings, while ``predict`` looks labels up
        by ``int`` index. Numeric keys are therefore converted to ``int`` so
        that labels loaded from the file actually match. A JSON list is also
        accepted and indexed by position.
        """
        labels_path = "/mnt/models/class_labels.json"

        if not os.path.exists(labels_path):
            # Use ImageNet labels
            return self._get_imagenet_labels()

        with open(labels_path, 'r', encoding="utf-8") as f:
            raw_labels = json.load(f)

        if isinstance(raw_labels, list):
            return dict(enumerate(raw_labels))

        def _as_class_id(key):
            # Leave non-numeric keys untouched rather than failing the load.
            try:
                return int(key)
            except (TypeError, ValueError):
                return key

        return {_as_class_id(key): name for key, name in raw_labels.items()}

    def _get_imagenet_labels(self):
        """Return the built-in index-to-name mapping for ImageNet classes.

        Only the first few classes are listed here for brevity; a real
        deployment would load the full label set from a file.
        """
        leading_class_names = [
            "tench", "goldfish", "great_white_shark", "tiger_shark",
            "hammerhead", "electric_ray", "stingray", "cock",
            "hen", "ostrich", "brambling", "goldfinch",
            # ... (1000 total classes)
        ]
        # Position in the list is the class index.
        return dict(enumerate(leading_class_names))

    def _decode_image(self, image_b64: str) -> Image.Image:
        """Decode a base64 string into a fully loaded RGB PIL image.

        Args:
            image_b64: Base64-encoded image file contents.

        Returns:
            The decoded image in ``RGB`` mode.

        Raises:
            ValueError: If the payload is not valid base64 or is not a
                readable image. The original error is chained as the cause.
        """
        try:
            image_bytes = base64.b64decode(image_b64)
            image = Image.open(io.BytesIO(image_bytes))

            if image.mode != 'RGB':
                # convert() decodes the pixel data as a side effect.
                image = image.convert('RGB')
            else:
                # Image.open is lazy; force decoding here so truncated or
                # corrupt data is reported as ValueError by this method
                # instead of surfacing later inside the transform.
                image.load()

            return image
        except Exception as e:
            raise ValueError(f"Failed to decode image: {str(e)}") from e

    def preprocess(self, input_list: List[ImageInput]) -> dict:
        """Decode every input image and split the batch by task.

        Args:
            input_list: Requests to process, in order.

        Returns:
            A dict with:
              - ``classification_batch``: stacked tensor on ``self.device`` of
                all classification images, or ``None`` if there are none;
              - ``detection_images``: list of HWC ``uint8`` arrays in BGR
                channel order, one per detection request;
              - ``original_sizes``: PIL ``size`` of every input, in order;
              - ``tasks``: the task of every input, in order;
              - ``preprocessing_time``: elapsed seconds.

        Raises:
            ValueError: If an image cannot be decoded or a task is neither
                ``"classification"`` nor ``"detection"``.
        """
        import time
        start_time = time.time()

        classification_images = []
        detection_images = []
        original_sizes = []
        tasks = []

        for input_item in input_list:
            task = input_item.task
            # Reject unknown tasks up front. Previously such items were left
            # out of both batches, yet postprocess still consumed a detection
            # slot for them, shifting results onto the wrong images.
            if task not in ("classification", "detection"):
                raise ValueError(
                    f"Unsupported task {task!r}; expected 'classification' or 'detection'"
                )

            image = self._decode_image(input_item.image)
            original_sizes.append(image.size)
            tasks.append(task)

            if task == "classification":
                classification_images.append(self.transform(image))
            else:
                # Ultralytics treats numpy inputs as BGR (OpenCV convention),
                # whereas PIL yields RGB, so flip the channel axis. The copy
                # keeps the array contiguous for downstream consumers.
                rgb_pixels = np.array(image)
                detection_images.append(np.ascontiguousarray(rgb_pixels[:, :, ::-1]))

        # Stack classification images into batch tensor
        if classification_images:
            classification_batch = torch.stack(classification_images).to(self.device)
        else:
            classification_batch = None

        preprocessing_time = time.time() - start_time

        return {
            "classification_batch": classification_batch,
            "detection_images": detection_images,
            "original_sizes": original_sizes,
            "tasks": tasks,
            "preprocessing_time": preprocessing_time
        }

    def predict(self, input_list: List[ImageInput], preprocessed_data: dict) -> dict:
        """Run classification and/or detection on the preprocessed batch.

        Args:
            input_list: The original requests (not read here; kept for the
                pipeline interface).
            preprocessed_data: Output of ``preprocess``.

        Returns:
            A dict with:
              - ``classification_results``: per classification image, a list of
                up to five ``{class_id, class_name, confidence}`` dicts sorted
                by descending confidence;
              - ``detection_results``: per detection image, a list of
                ``{bbox, confidence, class_id, class_name}`` dicts (empty list
                overall when no detector is loaded);
              - ``inference_time``: elapsed seconds;
              - ``original_sizes``: forwarded from ``preprocessed_data`` when
                present, so ``postprocess`` can report real image sizes.
        """
        import time
        start_time = time.time()

        classification_results = []
        detection_results = []

        # Classification inference
        classification_batch = preprocessed_data["classification_batch"]
        if classification_batch is not None:
            with torch.no_grad():
                outputs = self.classification_model(classification_batch)
                probabilities = torch.nn.functional.softmax(outputs, dim=1)

            # Clamp k so models with fewer than five classes do not crash topk.
            top_k = min(5, probabilities.shape[1])
            top_probs, top_indices = torch.topk(probabilities, top_k, dim=1)

            # tolist() yields plain Python floats/ints, one row per image.
            for row_probs, row_indices in zip(top_probs.tolist(), top_indices.tolist()):
                classification_results.append([
                    {
                        "class_id": idx,
                        "class_name": self.class_labels.get(idx, f"class_{idx}"),
                        "confidence": prob
                    }
                    for idx, prob in zip(row_indices, row_probs)
                ])

        # Detection inference
        detection_images = preprocessed_data["detection_images"]
        if detection_images and self.detection_model is not None:
            class_names = self.detection_model.names
            for image in detection_images:
                results = self.detection_model(image)

                detections = []
                for result in results:
                    boxes = result.boxes
                    if boxes is None:
                        continue
                    for box in boxes:
                        x1, y1, x2, y2 = box.xyxy[0].tolist()
                        class_id = int(box.cls[0])

                        detections.append({
                            "bbox": [float(x1), float(y1), float(x2), float(y2)],
                            "confidence": float(box.conf[0]),
                            "class_id": class_id,
                            "class_name": class_names[class_id]
                        })

                detection_results.append(detections)

        inference_time = time.time() - start_time

        raw_output = {
            "classification_results": classification_results,
            "detection_results": detection_results,
            "inference_time": inference_time
        }
        # postprocess reads "original_sizes" from this dict; without it any
        # batch larger than one fails while indexing its one-element fallback.
        if "original_sizes" in preprocessed_data:
            raw_output["original_sizes"] = preprocessed_data["original_sizes"]

        return raw_output

    def postprocess(self, input_list: List[ImageInput], raw_output: dict) -> List[ImageOutput]:
        """Turn raw model output into one ``ImageOutput`` per request.

        Args:
            input_list: The original requests, in order.
            raw_output: Output of ``predict``. ``classification_results`` and
                ``detection_results`` are consumed in request order for the
                respective task; ``original_sizes`` is optional.

        Returns:
            One ``ImageOutput`` per entry of ``input_list``. Predictions below
            the request's ``confidence_threshold`` are removed, and detections
            are capped at ``max_detections``, keeping the most confident ones.
            ``image_size`` is ``(0, 0)`` when no size is available.
        """
        outputs = []

        classification_results = raw_output["classification_results"]
        detection_results = raw_output["detection_results"]
        # The previous fallback was a one-element list indexed by position,
        # which raised IndexError for every item after the first.
        original_sizes = raw_output.get("original_sizes") or []
        model_name = self.model_config.get("model_name", "vision_model")

        classification_idx = 0
        detection_idx = 0

        for i, input_item in enumerate(input_list):
            threshold = input_item.confidence_threshold

            if input_item.task == "classification":
                predictions = classification_results[classification_idx]
                classification_idx += 1
                filtered_predictions = [
                    pred for pred in predictions
                    if pred["confidence"] >= threshold
                ]
            else:  # detection
                # Detection results may be missing entirely (detector disabled).
                if detection_idx < len(detection_results):
                    predictions = detection_results[detection_idx]
                else:
                    predictions = []
                detection_idx += 1
                confident = [
                    pred for pred in predictions
                    if pred["confidence"] >= threshold
                ]
                # Sort before truncating so the cap keeps the best detections
                # regardless of the order the detector produced them in.
                confident.sort(key=lambda pred: pred["confidence"], reverse=True)
                filtered_predictions = confident[:input_item.max_detections]

            image_size = original_sizes[i] if i < len(original_sizes) else (0, 0)

            outputs.append(ImageOutput(
                task=input_item.task,
                predictions=filtered_predictions,
                processing_time=raw_output["inference_time"],
                image_size=image_size,
                model_name=model_name
            ))

        return outputs

2. Configuration¶

Create modalkit.yaml:

app_settings:
  app_prefix: "cv-service"

  # Authentication
  auth_config:
    ssm_key: "/cv-service/api-key"
    auth_header: "x-api-key"

  # Container with CV dependencies
  build_config:
    image: "python:3.11"
    tag: "latest"
    workdir: "/app"
    env:
      OPENCV_VERSION: "4.8.0"
      TORCH_VERSION: "2.0.0"
    extra_run_commands:
      # Install system dependencies
      - "apt-get update && apt-get install -y libgl1-mesa-glx libglib2.0-0 libsm6 libxext6 libxrender-dev libgomp1"
      # Install PyTorch with CUDA support
      - "pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118"
      # Install OpenCV and other CV libraries
      - "pip install opencv-python pillow ultralytics"
      # Install additional ML libraries
      - "pip install scikit-image matplotlib seaborn"

  # GPU deployment for CV workloads
  deployment_config:
    gpu: "T4"  # T4 is good for most CV tasks, A10G for larger models
    concurrency_limit: 8
    container_idle_timeout: 600
    retries: 3
    memory: 16384  # 16GB RAM for image processing

    # Mount models and datasets from cloud storage
    cloud_bucket_mounts:
      - mount_point: "/mnt/models"
        bucket_name: "cv-models-bucket"
        secret: "aws-credentials"
        key_prefix: "production/"
        read_only: true
      - mount_point: "/mnt/datasets"
        bucket_name: "cv-datasets-bucket"
        secret: "aws-credentials"
        key_prefix: "validation/"
        read_only: true

    # Cache for model downloads
    volumes:
      "/tmp/model_cache": "cv-model-cache"
    volume_reload_interval_seconds: 1800  # 30 minutes

  # Batch processing for multiple images
  batch_config:
    max_batch_size: 16  # Process multiple images efficiently
    wait_ms: 150

  # Async processing for large images
  queue_config:
    backend: "taskiq"
    broker_url: "redis://redis:6379"

# Model configuration
model_settings:
  local_model_repository_folder: "./models"
  common:
    device: "cuda"
    model_cache_dir: "/tmp/model_cache"
  model_entries:
    cv_model:
      model_name: "resnet50_cv"
      num_classes: 1000
      input_size: 224
    detection_model:
      model_name: "yolov8n"
      confidence_threshold: 0.5
      iou_threshold: 0.45

3. Modal App Definition¶

Create app.py:

import modal
from modalkit.modal_service import ModalService, create_web_endpoints
from modalkit.modal_config import ModalConfig
from vision_model import ComputerVisionInference, ImageInput, ImageOutput

# Initialize Modalkit
# Shared configuration object used by every decorator below.
# NOTE(review): presumably this reads modalkit.yaml -- confirm against the
# ModalConfig documentation.
modal_utils = ModalConfig()
# The Modal app is named from the configuration rather than hard-coded.
app = modal.App(name=modal_utils.app_name)

# Define Modal app class
@app.cls(**modal_utils.get_app_cls_settings())
class CVApp(ModalService):
    """Modal service class that serves the computer vision pipeline.

    Container settings for the class come from
    ``modal_utils.get_app_cls_settings()``.
    """

    # Pipeline class that performs the actual inference.
    inference_implementation = ComputerVisionInference
    # Defaults to "cv_model", the entry name used under model_entries in
    # modalkit.yaml.
    model_name: str = modal.parameter(default="cv_model")
    # Shares the module-level configuration object with the service.
    modal_utils: ModalConfig = modal_utils

# Create endpoints
@app.function(**modal_utils.get_handler_settings())
@modal.asgi_app(**modal_utils.get_asgi_app_settings())
def web_endpoints():
    """Build and return the web application exposing ``CVApp``.

    Requests are validated against ``ImageInput`` and responses are shaped
    as ``ImageOutput``.
    """
    asgi_application = create_web_endpoints(
        app_cls=CVApp,
        input_model=ImageInput,
        output_model=ImageOutput,
    )
    return asgi_application

# Utility function for image processing
@app.function(
    gpu="T4",
    image=modal.Image.debian_slim().pip_install(
        "opencv-python", "pillow", "torch", "torchvision"
    )
)
def process_image_batch(image_paths: list):
    """Process a batch of images from cloud storage.

    Placeholder: the body is intentionally empty and currently returns
    ``None``. It runs on its own T4 image with OpenCV, Pillow and PyTorch
    installed, separate from the main service container.

    Args:
        image_paths: Locations of the images to process.
    """
    # This could be used for batch processing from S3/GCS
    pass

if __name__ == "__main__":
    # Entry point when the file is run directly; the context body is empty,
    # so nothing beyond entering and leaving the context happens here.
    # NOTE(review): confirm that the installed modal version exposes
    # enable_local_development().
    with modal.enable_local_development():
        pass

4. Usage Examples¶

Image Classification¶

import requests
import base64
from PIL import Image
import io

def encode_image(image_path: str) -> str:
    """Read the file at ``image_path`` and return its contents as base64 text."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode()

# Classify an image
headers = {"x-api-key": "your-api-key"}
image_b64 = encode_image("path/to/image.jpg")

response = requests.post(
    "https://your-org--cv-service.modal.run/predict_sync",
    json={
        "image": image_b64,
        "task": "classification",
        "confidence_threshold": 0.1
    },
    headers=headers,
    # requests waits indefinitely by default; bound the call explicitly.
    timeout=60,
)
# Surface auth, validation and server errors here instead of failing with a
# KeyError when the error body has no "predictions" field.
response.raise_for_status()

result = response.json()
print("Top predictions:")
for pred in result["predictions"][:3]:
    print(f"  {pred['class_name']}: {pred['confidence']:.3f}")

Object Detection¶

import requests
import base64

# Detect objects in an image
# encode_image() is the helper defined in the Image Classification example.
headers = {"x-api-key": "your-api-key"}
image_b64 = encode_image("path/to/image.jpg")

response = requests.post(
    "https://your-org--cv-service.modal.run/predict_sync",
    json={
        "image": image_b64,
        "task": "detection",
        "confidence_threshold": 0.5,
        "max_detections": 10
    },
    headers=headers,
    # requests waits indefinitely by default; bound the call explicitly.
    timeout=60,
)
# Surface auth, validation and server errors here instead of failing with a
# KeyError when the error body has no "predictions" field.
response.raise_for_status()

result = response.json()
print("Detected objects:")
for detection in result["predictions"]:
    bbox = detection["bbox"]
    print(f"  {detection['class_name']}: {detection['confidence']:.3f} at [{bbox[0]:.1f}, {bbox[1]:.1f}, {bbox[2]:.1f}, {bbox[3]:.1f}]")

Batch Processing¶

import requests
import base64
import os

# Process multiple images
# encode_image() is the helper defined in the Image Classification example.
headers = {"x-api-key": "your-api-key"}
image_folder = "path/to/images/"

batch_requests = []
for filename in os.listdir(image_folder):
    if filename.lower().endswith(('.jpg', '.jpeg', '.png')):
        image_path = os.path.join(image_folder, filename)
        image_b64 = encode_image(image_path)
        batch_requests.append({
            "image": image_b64,
            "task": "classification",
            "confidence_threshold": 0.2
        })

response = requests.post(
    "https://your-org--cv-service.modal.run/predict_batch",
    json=batch_requests,
    headers=headers,
    # Batches take longer than single calls, but should still be bounded.
    timeout=120,
)
# Stop on HTTP errors rather than iterating over an error payload.
response.raise_for_status()

results = response.json()
for i, result in enumerate(results):
    predictions = result['predictions']
    if not predictions:
        # Every class can fall below the confidence threshold, leaving an
        # empty list; indexing [0] would raise IndexError.
        print(f"Image {i+1}: no predictions above threshold")
        continue
    top = predictions[0]
    print(f"Image {i+1}: {top['class_name']} ({top['confidence']:.3f})")

5. Advanced Features¶

Custom Model Loading¶

class CustomVisionInference(ComputerVisionInference):
    """Variant of the pipeline that classifies with a timm EfficientNet-B0."""

    def _load_classification_model(self):
        """Return an EfficientNet-B0 on ``self.device`` in eval mode.

        A 10-class model is built and filled from
        ``/mnt/models/custom_classifier.pth`` when that file exists; otherwise
        the pre-trained 1000-class model is used as a fallback.
        """
        import timm

        checkpoint_path = "/mnt/models/custom_classifier.pth"
        if not os.path.exists(checkpoint_path):
            # Fallback to pre-trained
            model = timm.create_model('efficientnet_b0', pretrained=True, num_classes=1000)
        else:
            # Load custom model
            model = timm.create_model('efficientnet_b0', pretrained=False, num_classes=10)
            custom_weights = torch.load(checkpoint_path, map_location=self.device)
            model.load_state_dict(custom_weights)

        # Both calls return the module itself, so they can be chained.
        return model.to(self.device).eval()

Image Augmentation¶

def get_augmentation_transforms(self):
    """Return a transform pipeline that augments, then prepares a model input.

    Random rotation, horizontal flip and colour jitter are applied first,
    followed by the same resize / tensor conversion / normalisation steps
    used for plain classification preprocessing.
    """
    random_augmentations = [
        transforms.RandomRotation(10),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.2, contrast=0.2),
    ]
    tensor_preparation = [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
    return transforms.Compose(random_augmentations + tensor_preparation)

Performance Monitoring¶

import time
import logging

logger = logging.getLogger(__name__)

class MonitoredVisionInference(ComputerVisionInference):
    """Pipeline variant that logs batch size, tasks and timing around predict."""

    def predict(self, input_list, preprocessed_data):
        """Run the parent ``predict`` and log how long it took.

        Args:
            input_list: The original requests; only its length is used here.
            preprocessed_data: Output of ``preprocess``; its ``tasks`` entry
                is logged.

        Returns:
            Whatever the parent ``predict`` returns, unchanged.
        """
        # perf_counter is monotonic, so the interval cannot go negative if
        # the wall clock is adjusted mid-request.
        start_time = time.perf_counter()
        image_count = len(input_list)

        # Log input characteristics (lazy %-formatting: no string work when
        # the INFO level is disabled).
        logger.info("Processing %d images", image_count)
        logger.info("Tasks: %s", preprocessed_data['tasks'])

        result = super().predict(input_list, preprocessed_data)

        # Log performance metrics
        inference_time = time.perf_counter() - start_time
        logger.info("Inference completed in %.3fs", inference_time)
        if image_count:
            # Skip the average for an empty batch to avoid ZeroDivisionError.
            logger.info("Average time per image: %.3fs", inference_time / image_count)

        return result

6. Production Deployment¶

Larger GPU Configuration¶

deployment_config:
  gpu: "A10G"  # More powerful GPU for large models
  concurrency_limit: 4  # Lower limit for GPU memory

batch_config:
  max_batch_size: 8  # Smaller batches for GPU memory constraints

Auto-scaling Configuration¶

deployment_config:
  concurrency_limit: 20
  container_idle_timeout: 300  # Scale down quickly

  # Use cheaper instances for light workloads
  cpu: 8.0
  memory: 32768

Model Versioning¶

model_settings:
  model_entries:
    cv_model_v1:
      model_name: "resnet50"
      checkpoint: "v1.0"
    cv_model_v2:
      model_name: "efficientnet_b3"
      checkpoint: "v2.0"

7. Error Handling and Validation¶

from modalkit.exceptions import DependencyError

def validate_image_input(self, image_b64: str) -> None:
    """Validate a base64-encoded image before it enters the pipeline.

    Checks, in order: the payload decodes as base64, the decoded size is at
    most 10MB, the bytes are a readable image whose contents pass Pillow's
    integrity check, and the pixel count is at most 10000 x 10000.

    Args:
        image_b64: Base64-encoded image file contents.

    Raises:
        ValueError: With ``"Image too large (max 10MB)"`` or
            ``"Image dimensions too large"`` for limit violations, or
            ``"Invalid image data: ..."`` (chained to the underlying error)
            when the payload cannot be decoded or read as an image.
    """
    import binascii

    max_bytes = 10 * 1024 * 1024
    max_pixels = 10000 * 10000

    # Check if it's valid base64
    try:
        image_bytes = base64.b64decode(image_b64)
    except (binascii.Error, ValueError, TypeError) as e:
        raise ValueError(f"Invalid image data: {str(e)}") from e

    # Check file size (max 10MB). Raised outside any try block so the message
    # is not re-wrapped as "Invalid image data: ...".
    if len(image_bytes) > max_bytes:
        raise ValueError("Image too large (max 10MB)")

    # Check if it's a valid image. Image.open only parses the header, so
    # verify() is needed to catch broken file contents.
    try:
        image = Image.open(io.BytesIO(image_bytes))
        width, height = image.width, image.height
        image.verify()
    except Exception as e:
        raise ValueError(f"Invalid image data: {str(e)}") from e

    # Check dimensions
    if width * height > max_pixels:
        raise ValueError("Image dimensions too large")

Key Features Demonstrated¶

Multi-Task Support: Both classification and detection in one service
GPU Acceleration: Efficient GPU usage for computer vision workloads
Batch Processing: Process multiple images efficiently
Cloud Storage: Model and dataset loading from S3/GCS
Production Ready: Proper error handling, monitoring, and scaling
Flexible Input: Support for various image formats and sizes
Model Versioning: Easy model updates and A/B testing

This example shows how Modalkit simplifies deploying complex computer vision models with production-grade features and performance optimization.