Git Collector¶

Bases: CollectorInterface

Collector for Git repository information

Source code in tracelet/collectors/git.py

def __init__(self, repo_path: Optional[str] = None):
    """Set up the collector for the repository at ``repo_path``.

    Args:
        repo_path: Path of the repository to inspect. When omitted (or
            otherwise falsy, such as an empty string), the current working
            directory at construction time is used instead.
    """
    self.repo_path = repo_path or os.getcwd()
    # No repository handle exists yet; the attribute only starts out as None.
    # NOTE(review): presumably populated later by initialize() -- confirm.
    self._repo = None

options:
  show_source: true
  show_bases: true
  merge_init_into_class: true
  heading_level: 2

Overview¶

The Git Collector automatically captures git repository information for experiment reproducibility and version tracking.

Basic Usage¶

import tracelet

# Starting an experiment is enough: git information is collected automatically
exp = tracelet.start_logging(
    backend="mlflow",
    project="version_control",
    exp_name="git_tracking_demo",
)

# At this point the git info has been collected and logged automatically;
# no manual intervention is needed

Manual Git Collection¶

from tracelet.collectors.git import GitCollector

# Build a collector; without a repo_path it targets the current directory
git_collector = GitCollector()

# Initialize the collector, then gather the git information
git_collector.initialize()
git_info = git_collector.collect()

# Print a header followed by one indented "key: value" line per field
print("Git Information:")
for field, detail in git_info.items():
    print(f"  {field}: {detail}")

Collected Information¶

The Git Collector captures the following repository information:

Repository State¶

commit_hash: Current commit SHA
branch: Current branch name
remote_url: Remote repository URL
is_dirty: Whether working directory has uncommitted changes

Commit Details¶

commit_message: Latest commit message
commit_author: Commit author name and email
commit_date: Commit timestamp

Working Directory Status¶

uncommitted_files: List of modified files (if any)
untracked_files: List of untracked files (if any)

Configuration Options¶

Custom Repository Path¶

from tracelet.collectors.git import GitCollector

# Point the collector at a repository other than the current directory
custom_repo = "/path/to/custom/repo"
git_collector = GitCollector(repo_path=custom_repo)

# Initialize first, then collect from that repository
git_collector.initialize()
git_info = git_collector.collect()

Integration with Settings¶

import tracelet
from tracelet.settings import TraceletSettings

# Configure the experiment through a settings object
settings = TraceletSettings(
    project="git_configured",
    backend=["mlflow"],
    # No git-specific options are shown here; add them at this point if the
    # installed tracelet version supports any
)

# `tracelet` must be imported explicitly: the settings import alone does not
# bind the top-level package name
tracelet.start_logging(exp_name="git_exp", settings=settings)

Practical Examples¶

Experiment Reproducibility¶

import json

import tracelet
from tracelet.collectors.git import GitCollector

# Start experiment with automatic git tracking
exp = tracelet.start_logging(
    exp_name="reproducible_experiment",
    project="research",
    backend="mlflow"
)

try:
    # Collect git information explicitly so it can be logged with the run
    git_collector = GitCollector()
    git_collector.initialize()
    git_info = git_collector.collect()

    # Log git info as parameters for reproducibility
    exp.log_params({
        "git_commit": git_info.get("commit_hash"),
        "git_branch": git_info.get("branch"),
        "git_is_dirty": git_info.get("is_dirty", False)
    })

    # Save detailed git info as artifact; default=str stringifies any value
    # that json cannot encode natively
    with open("git_info.json", "w", encoding="utf-8") as f:
        json.dump(git_info, f, indent=2, default=str)

    exp.log_artifact("git_info.json", "metadata/git_info.json")

    # Your training code here...
finally:
    # Always close the run, even when collection or training raised
    tracelet.stop_logging()

Pre-experiment Validation¶

from tracelet.collectors.git import GitCollector
import sys

def validate_git_state():
    """Check the repository state and ask for confirmation when it looks risky.

    Prompts on stdin when the working directory has uncommitted changes, and
    again when the current branch is ``main`` or ``master``. Any answer other
    than ``y`` (case-insensitive) prints a cancellation message and exits the
    process with status 1.

    Returns:
        The git information dictionary produced by the collector.
    """
    def confirm_or_abort(question):
        # Only an explicit "y"/"Y" answer lets the experiment continue
        if input(question).lower() == 'y':
            return
        print("Experiment cancelled.")
        sys.exit(1)

    collector = GitCollector()
    collector.initialize()
    git_info = collector.collect()

    # Uncommitted changes: show what is modified, then ask
    if git_info.get("is_dirty", False):
        print("Warning: Working directory has uncommitted changes!")
        print(f"Modified files: {git_info.get('uncommitted_files', [])}")
        confirm_or_abort("Continue anyway? (y/N): ")

    # Running directly on main/master also needs an explicit confirmation
    branch_name = git_info.get("branch", "")
    if branch_name in ("main", "master"):
        print(f"Warning: Running experiment on {branch_name} branch!")
        confirm_or_abort("Are you sure? (y/N): ")

    return git_info

# Usage
import tracelet

git_info = validate_git_state()

# Read fields with .get() and a fallback, as the other examples do, so a
# missing or None value does not crash the script before the run starts
print(f"Starting experiment on branch: {git_info.get('branch', 'unknown')}")
print(f"Commit: {(git_info.get('commit_hash') or 'unknown')[:8]}")

tracelet.start_logging(
    exp_name="validated_experiment",
    project="safe_experiments",
    backend="mlflow"
)

Multi-Repository Projects¶

from tracelet.collectors.git import GitCollector
import os

def collect_multi_repo_info(repo_paths):
    """Gather git information for several repositories at once.

    Args:
        repo_paths: Mapping of a label to the repository path to inspect.

    Returns:
        A dict keyed by the same labels. Each value is the collector's result
        for that path, or ``{"error": "Repository not found"}`` when the path
        does not exist on disk.
    """
    def describe(location):
        # Paths that do not exist are reported instead of being collected
        if not os.path.exists(location):
            return {"error": "Repository not found"}
        collector = GitCollector(repo_path=location)
        collector.initialize()
        return collector.collect()

    return {label: describe(location) for label, location in repo_paths.items()}

# Example usage for projects with multiple repositories
import tracelet

repo_paths = {
    "main_code": ".",
    "data_processing": "../data-pipeline",
    "model_configs": "../model-configs"
}

multi_git_info = collect_multi_repo_info(repo_paths)

# Start experiment
exp = tracelet.start_logging(
    exp_name="multi_repo_experiment",
    project="complex_project",
    backend="mlflow"
)

# Log git info for each repository; entries that carry an "error" key
# (path not found) are skipped
for repo_name, git_info in multi_git_info.items():
    if "error" not in git_info:
        exp.log_params({
            f"{repo_name}_commit": git_info.get("commit_hash"),
            f"{repo_name}_branch": git_info.get("branch"),
            f"{repo_name}_is_dirty": git_info.get("is_dirty", False)
        })

Error Handling¶

Repository Detection¶

from tracelet.collectors.git import GitCollector
import logging

def safe_git_collection(repo_path=None):
    """Safely collect git information with error handling.

    Args:
        repo_path: Repository path handed to ``GitCollector``. ``None`` lets
            the collector choose its own default.

    Returns:
        The collector's result on success. On any failure, a fallback dict
        with the keys ``"error"`` (the exception text), ``"git_available"``
        (always ``False``) and ``"fallback_info"`` (a timestamp and the
        hostname, or ``"unknown"`` where ``os.uname`` is unavailable).
    """
    # Imported locally so the error path works even when the surrounding
    # script has not imported these modules itself.
    import os
    from datetime import datetime

    try:
        git_collector = GitCollector(repo_path=repo_path)
        git_collector.initialize()
        return git_collector.collect()

    except Exception as e:
        # Deliberately broad: this helper is a best-effort boundary that must
        # never let git problems abort the experiment.
        logging.warning("Failed to collect git information: %s", e)
        return {
            "error": str(e),
            "git_available": False,
            "fallback_info": {
                "timestamp": str(datetime.now()),
                "hostname": os.uname().nodename if hasattr(os, 'uname') else "unknown"
            }
        }

# Usage
git_info = safe_git_collection()

# A result without a "git_available" key is treated as a successful collection
if not git_info.get("git_available", True):
    print(f"Git not available: {git_info.get('error')}")
else:
    print(f"Git commit: {git_info.get('commit_hash')}")

Non-Git Directories¶

import os
from tracelet.collectors.git import GitCollector

def check_git_repository(path="."):
    """Report whether ``path`` directly contains a ``.git`` entry.

    Only ``path`` itself is inspected; parent directories are not searched.
    Any existing entry named ``.git`` counts, whether directory or file.
    """
    return os.path.exists(os.path.join(path, ".git"))

# Only build the collector when a .git entry is present in the current directory
if not check_git_repository():
    print("No git repository found")
    git_info = {"git_available": False}
else:
    git_collector = GitCollector()
    git_collector.initialize()
    git_info = git_collector.collect()
    print("Git repository detected")

Integration Patterns¶

Automatic Git Tagging¶

import subprocess
from tracelet.collectors.git import GitCollector
import tracelet

def create_experiment_tag(exp_name):
    """Create git tag for experiment.

    The tag is named ``experiment-<exp_name>-<unix timestamp>`` and is created
    by running ``git tag`` in the current working directory.

    Args:
        exp_name: Experiment name embedded in the tag name.

    Returns:
        The tag name on success, or ``None`` when git exits with a non-zero
        status or cannot be started at all (for example, git is not installed).
    """
    # Imported locally so the example works without a module-level import
    import time

    tag_name = f"experiment-{exp_name}-{int(time.time())}"

    try:
        subprocess.run(["git", "tag", tag_name], check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # CalledProcessError: git ran but failed; OSError: git could not be
        # executed (missing binary, permissions)
        print(f"Failed to create git tag: {e}")
        return None
    return tag_name

# Usage
exp_name = "important_experiment"

# Tag the repository first, so the tag is created before the run starts
tag_name = create_experiment_tag(exp_name)

exp = tracelet.start_logging(
    backend="mlflow",
    project="tagged_experiments",
    exp_name=exp_name,
)

# tag_name is None when tagging failed; record it only when a tag was created
if tag_name:
    exp.log_params({"git_tag": tag_name})

CI/CD Integration¶

import os
from tracelet.collectors.git import GitCollector

def get_ci_git_info():
    """Get git information in CI/CD environment.

    The provider is detected from environment variables: GitHub Actions is
    checked first, then GitLab CI. When neither marker is present, the
    information is collected locally with ``GitCollector``.

    Returns:
        A dict that always contains ``"ci_provider"`` (``"github_actions"``,
        ``"gitlab_ci"`` or ``"local"``). For the two CI providers it also
        holds ``"commit_hash"``, ``"branch"``, ``"repository"`` and one
        provider-specific key (``"actor"`` or ``"pipeline_id"``); values are
        ``None`` when the corresponding variable is unset. In the local case
        it is the collector's result with ``"ci_provider"`` added.
    """
    env = os.environ

    # GitHub Actions
    if "GITHUB_ACTIONS" in env:
        return {
            "ci_provider": "github_actions",
            "commit_hash": env.get("GITHUB_SHA"),
            # On pull_request events GITHUB_REF_NAME is "<pr_number>/merge",
            # not a branch; GITHUB_HEAD_REF holds the source branch and is
            # only populated for those events, so prefer it when non-empty.
            "branch": env.get("GITHUB_HEAD_REF") or env.get("GITHUB_REF_NAME"),
            "repository": env.get("GITHUB_REPOSITORY"),
            "actor": env.get("GITHUB_ACTOR")
        }

    # GitLab CI
    if "GITLAB_CI" in env:
        return {
            "ci_provider": "gitlab_ci",
            "commit_hash": env.get("CI_COMMIT_SHA"),
            "branch": env.get("CI_COMMIT_REF_NAME"),
            "repository": env.get("CI_PROJECT_PATH"),
            "pipeline_id": env.get("CI_PIPELINE_ID")
        }

    # Fall back to git collector
    git_collector = GitCollector()
    git_collector.initialize()
    git_info = git_collector.collect()
    git_info["ci_provider"] = "local"
    return git_info

# Usage in CI/CD pipeline
import tracelet

git_info = get_ci_git_info()

exp = tracelet.start_logging(
    exp_name="ci_experiment",
    project="automated_training",
    backend="mlflow"
)

# These three keys are read with .get(), so an unset value is logged as None
exp.log_params({
    "ci_provider": git_info.get("ci_provider"),
    "commit_hash": git_info.get("commit_hash"),
    "branch": git_info.get("branch")
})

Best Practices¶

Repository Hygiene¶

from tracelet.collectors.git import GitCollector

def validate_repository_state():
    """Inspect the repository and list anything that hurts reproducibility.

    Three conditions are reported: uncommitted changes, untracked files
    (with their count), and a missing or ``"HEAD"`` branch name, which is
    reported as a detached HEAD.

    Returns:
        A ``(issues, git_info)`` tuple: the list of human-readable issue
        strings (empty when nothing was found) and the collected git info.
    """
    collector = GitCollector()
    collector.initialize()
    git_info = collector.collect()

    problems = []

    # Modified but uncommitted work
    if git_info.get("is_dirty", False):
        problems.append("Uncommitted changes detected")

    # Files git does not know about yet
    pending = git_info.get("untracked_files", [])
    if pending:
        problems.append(f"Untracked files: {len(pending)}")

    # An empty branch name or the literal "HEAD" is reported as detached HEAD
    head = git_info.get("branch", "")
    if not head or head == "HEAD":
        problems.append("Detached HEAD state")

    return problems, git_info

# Usage
import tracelet

issues, git_info = validate_repository_state()

if issues:
    print("Repository state issues:")
    for issue in issues:
        print(f"  - {issue}")

    # Log issues as experiment metadata
    exp = tracelet.start_logging(
        exp_name="experiment_with_issues",
        project="debug",
        backend="mlflow"
    )

    # All issues are joined into a single comma-separated parameter value
    exp.log_params({
        "git_issues": ", ".join(issues),
        "git_validation": "failed"
    })
else:
    print("Repository state is clean")

Experiment Lineage¶

import json
from tracelet.collectors.git import GitCollector

def track_experiment_lineage(parent_commit=None):
    """Track experiment lineage through git history.

    Collects the current commit and branch, combines them with the given
    parent commit and a timestamp, and writes the result to
    ``experiment_lineage.json`` in the current working directory.

    Args:
        parent_commit: Commit identifier of the experiment this one derives
            from, or ``None`` when there is no parent.

    Returns:
        The lineage dict with the keys ``"current_commit"``,
        ``"current_branch"``, ``"parent_commit"`` and ``"timestamp"``.
    """
    # Imported locally so the example works without a module-level import
    from datetime import datetime

    git_collector = GitCollector()
    git_collector.initialize()
    git_info = git_collector.collect()

    lineage_info = {
        "current_commit": git_info.get("commit_hash"),
        "current_branch": git_info.get("branch"),
        "parent_commit": parent_commit,
        "timestamp": str(datetime.now())
    }

    # Save lineage information; explicit encoding keeps the file identical
    # across platforms with different default encodings
    with open("experiment_lineage.json", "w", encoding="utf-8") as f:
        json.dump(lineage_info, f, indent=2)

    return lineage_info

# Usage
import tracelet

lineage = track_experiment_lineage(parent_commit="abc123def456")

exp = tracelet.start_logging(
    exp_name="lineage_tracked",
    project="experiment_lineage",
    backend="mlflow"
)

# Record the lineage both as parameters and as the JSON file written above
exp.log_params(lineage)
exp.log_artifact("experiment_lineage.json", "metadata/lineage.json")